mirror of
https://github.com/feeddeck/feeddeck.git
synced 2026-05-26 21:55:08 -05:00
For the item ingestion logic we are now also looking at the published date for all entries in an RSS feed. If the published date is older than the last update date of a source, the item will be skipped. This way we should reduce the number of duplicated items, where an entry was updated in the RSS feed, so that it had a new id and was written again to the database instead of only being updated.
154 lines
4.7 KiB
TypeScript
154 lines
4.7 KiB
TypeScript
import { SupabaseClient } from "@supabase/supabase-js";
|
|
import { parseFeed } from "rss";
|
|
import { FeedEntry } from "rss/types";
|
|
import { Md5 } from "std/md5";
|
|
import { Redis } from "redis";
|
|
import { unescape } from "lodash";
|
|
|
|
import { ISource } from "../models/source.ts";
|
|
import { IItem } from "../models/item.ts";
|
|
import { IProfile } from "../models/profile.ts";
|
|
import { fetchWithTimeout } from "../utils/fetchWithTimeout.ts";
|
|
import { log } from "../utils/log.ts";
|
|
|
|
/**
 * `getStackoverflowFeed` fetches and parses the RSS feed of a `stackoverflow` source and converts the feed entries
 * into items.
 *
 * For sources of type `tag` the feed url is derived from the configured tag and sort order, for all other types the
 * url from the source options is used as it is. The passed in `source` is updated in place (id, type, title, link and
 * icon) and returned together with the generated items.
 *
 * @throws {Error} "Invalid source options" when the `stackoverflow` options, the type, the tag (for type `tag`) or
 * the url is missing, and "Invalid feed" when the parsed feed has no title.
 */
export const getStackoverflowFeed = async (
  _supabaseClient: SupabaseClient,
  _redisClient: Redis | undefined,
  _profile: IProfile,
  source: ISource,
): Promise<{ source: ISource; items: IItem[] }> => {
  if (!source.options?.stackoverflow || !source.options?.stackoverflow?.type) {
    throw new Error("Invalid source options");
  }

  if (source.options.stackoverflow.type === "tag") {
    const { tag, sort } = source.options.stackoverflow;
    if (!tag) {
      throw new Error("Invalid source options");
    }

    /**
     * The tag and sort order must be percent-encoded, otherwise tags like `c#` (the `#` starts the url fragment) or
     * `c++` (a `+` is decoded as a space in a query string) would request the feed of a different tag. The sort
     * parameter is only added when a sort order is set.
     */
    const query = [`tagnames=${encodeURIComponent(tag)}`];
    if (sort) {
      query.push(`sort=${encodeURIComponent(sort)}`);
    }
    source.options.stackoverflow.url =
      `https://stackoverflow.com/feeds/tag?${query.join("&")}`;
  }

  if (!source.options?.stackoverflow.url) {
    throw new Error("Invalid source options");
  }

  /**
   * Get the RSS for the provided `stackoverflow` url and parse it. If the feed doesn't contain a title we throw an
   * error.
   */
  const response = await fetchWithTimeout(source.options.stackoverflow.url, {
    method: "get",
  }, 5000);
  const xml = await response.text();
  log("debug", "Add source", {
    sourceType: "stackoverflow",
    requestUrl: source.options.stackoverflow.url,
    responseStatus: response.status,
  });
  const feed = await parseFeed(xml);

  // Optional chaining, so that a feed without a title element results in the "Invalid feed" error and not in a
  // `TypeError`.
  if (!feed.title?.value) {
    throw new Error("Invalid feed");
  }

  /**
   * Generate a source id based on the user id, column id and the `stackoverflow` url, when the source doesn't have an
   * id yet. Besides that we also set the source type to `stackoverflow` and set the title and link for the source.
   */
  if (source.id === "") {
    source.id = generateSourceId(
      source.userId,
      source.columnId,
      source.options.stackoverflow.url,
    );
  }
  source.type = "stackoverflow";
  source.title = feed.title.value;
  if (feed.links.length > 0) {
    source.link = feed.links[0];
  }
  source.icon = undefined;

  /**
   * Now that the source does contain all the required information we can start to generate the items for the source, by
   * looping over all the feed entries.
   */
  const items: IItem[] = [];

  for (const [index, entry] of feed.entries.entries()) {
    if (skipEntry(index, entry, source.updatedAt || 0)) {
      continue;
    }

    /**
     * Create the item object and add it to the `items` array. The non-null assertions are safe, because `skipEntry`
     * already skipped all entries without a title, link or published date.
     */
    items.push({
      id: generateItemId(source.id, entry.id),
      userId: source.userId,
      columnId: source.columnId,
      sourceId: source.id,
      title: entry.title!.value!,
      link: entry.links[0].href!,
      media: undefined,
      description: entry.description?.value
        ? unescape(entry.description.value)
        : undefined,
      author: undefined,
      publishedAt: Math.floor(entry.published!.getTime() / 1000),
    });
  }

  return { source, items };
};
|
|
|
|
/**
 * `skipEntry` is used to determine if an entry should be skipped or not. When an entry in the RSS feed is skipped it
 * will not be added to the database. An entry will be skipped when
 * - it is not within the first 50 entries of the feed (`index` is 50 or larger), because we only keep the last 50
 *   items of each source in our delete logic.
 * - the entry does not contain a title, a link or a published date.
 * - the published date of the entry (in seconds) is not newer than the last update date of the source minus 10
 *   seconds.
 *
 * @param index Zero-based position of the entry within the feed.
 * @param entry The feed entry to check.
 * @param sourceUpdatedAt Last update date of the source, compared against the published date in seconds.
 * @returns `true` when the entry should be skipped, `false` when it should be turned into an item.
 */
const skipEntry = (
  index: number,
  entry: FeedEntry,
  sourceUpdatedAt: number,
): boolean => {
  const maxEntries = 50;

  // Every entry after the first 50 must be skipped, not only the one at position 50.
  if (index >= maxEntries) {
    return true;
  }

  if (!entry.title?.value || !entry.links?.[0]?.href || !entry.published) {
    return true;
  }

  const publishedAt = Math.floor(entry.published.getTime() / 1000);
  if (publishedAt <= sourceUpdatedAt - 10) {
    return true;
  }

  return false;
};
|
|
|
|
/**
 * `generateSourceId` builds the id of a source in the form `stackoverflow-<userId>-<columnId>-<hash>`, where the hash
 * is the MD5 digest of the provided link of the RSS feed.
 */
const generateSourceId = (
  userId: string,
  columnId: string,
  link: string,
): string => {
  const linkHash = new Md5().update(link).toString();
  return `stackoverflow-${userId}-${columnId}-${linkHash}`;
};
|
|
|
|
/**
 * `generateItemId` builds the id of an item by appending the MD5 digest of the provided identifier to the source id,
 * separated by a dash. The identifier can be the link of the item or the id of the item.
 */
const generateItemId = (sourceId: string, identifier: string): string => {
  const identifierHash = new Md5().update(identifier).toString();
  return `${sourceId}-${identifierHash}`;
};
|