mirror of
https://github.com/feeddeck/feeddeck.git
synced 2026-04-28 18:38:34 -05:00
This commit adds tests for all available sources. This commit also fixes the parsing of Atom feeds for the RSS source, where the `dc:date` field must be used for the `publishedAt` field.
402 lines
12 KiB
TypeScript
402 lines
12 KiB
TypeScript
import { SupabaseClient } from '@supabase/supabase-js';
|
|
import { Feed, parseFeed } from 'rss';
|
|
import { Md5 } from 'std/md5';
|
|
import { FeedEntry } from 'rss/types';
|
|
import { Redis } from 'redis';
|
|
import { unescape } from 'lodash';
|
|
import * as cheerio from 'cheerio';
|
|
|
|
import { IItem } from '../models/item.ts';
|
|
import { ISource } from '../models/source.ts';
|
|
import { feedutils } from './utils/index.ts';
|
|
import { IProfile } from '../models/profile.ts';
|
|
import { utils } from '../utils/index.ts';
|
|
|
|
/**
 * `getRSSFeed` fetches the RSS / Atom feed configured in `source.options.rss`
 * and converts it into the updated `source` plus the list of new `items`.
 *
 * The passed `source` object is updated in place (`id`, `type`, `title`,
 * `link`, `icon` and, when the feed had to be discovered via the website,
 * `options.rss`) and is also returned.
 *
 * @param supabaseClient Client handed to `feedutils.uploadSourceIcon`.
 * @param _redisClient Unused by this source.
 * @param _profile Unused by this source.
 * @param source The source to fetch; must carry a non-empty `options.rss`.
 * @returns The updated source and the items created from the feed entries.
 * @throws Error `Invalid source options` when `options.rss` is missing,
 * `Failed to get RSS feed` when neither the url nor the website behind it
 * yields a feed, and `Invalid feed` when the feed has no title.
 */
export const getRSSFeed = async (
  supabaseClient: SupabaseClient,
  _redisClient: Redis | undefined,
  _profile: IProfile,
  source: ISource,
): Promise<{ source: ISource; items: IItem[] }> => {
  if (!source.options?.rss) {
    throw new Error('Invalid source options');
  }

  /**
   * First treat the provided url as a feed. If that does not work the url
   * might point to a website, so we try to discover the feed from there.
   */
  let feed = await getFeed(source);
  if (!feed) {
    utils.log(
      'debug',
      'Failed to get RSS feed, try to get RSS feed from website',
      { requestUrl: source.options.rss },
    );
    feed = await getFeedFromWebsite(source);
  }
  if (!feed) {
    throw new Error('Failed to get RSS feed');
  }

  /**
   * A feed without a title is considered invalid.
   */
  if (!feed.title.value) {
    throw new Error('Invalid feed');
  }

  /**
   * New sources (empty id) get an id derived from the user id, the column id
   * and the feed url. Type and title are always refreshed from the feed.
   */
  const isNewSource = source.id === '';
  if (isNewSource) {
    source.id = generateSourceId(
      source.userId,
      source.columnId,
      source.options.rss,
    );
  }
  source.type = 'rss';
  source.title = feed.title.value;

  /**
   * The first link of the feed (if there is one) becomes the source link.
   */
  if (feed.links.length > 0) {
    source.link = feed.links[0];
  }

  /**
   * Only look for an icon when the source does not have one yet. The favicon
   * of the linked website is preferred, the icon / image of the feed is only
   * the fallback. Whatever we end up with is passed to `uploadSourceIcon` and
   * the returned value is stored as the icon of the source.
   */
  if (!source.icon) {
    if (source.link) {
      const favicon = await feedutils.getFavicon(source.link);
      if (favicon && favicon.url.startsWith('https://')) {
        source.icon = favicon.url;
      }
    }

    if (!source.icon) {
      const feedIconUsable = feed.icon && feed.icon.startsWith('https://');
      if (feedIconUsable) {
        source.icon = feed.icon;
      } else if (feed.image?.url && feed.image.url.startsWith('https://')) {
        source.icon = feed.image?.url;
      }
    }

    source.icon = await feedutils.uploadSourceIcon(supabaseClient, source);
  }

  /**
   * Convert every entry which is not skipped into an item.
   */
  const items: IItem[] = [];

  for (let index = 0; index < feed.entries.length; index++) {
    const entry = feed.entries[index];
    if (skipEntry(index, entry, source.updatedAt || 0)) {
      continue;
    }

    /**
     * The item id is built from the source id and the id of the entry. When
     * the entry has no id the href of its first link is used instead.
     */
    const identifier = entry.id ? entry.id : entry.links[0].href!;
    const itemId = generateItemId(source.id, identifier);

    /**
     * Pick the publication time in the order `published`, `updated`,
     * `dc:date` and fall back to the current time.
     */
    let publishedAt: number;
    if (entry.published) {
      publishedAt = Math.floor(entry.published.getTime() / 1000);
    } else if (entry.updated) {
      publishedAt = Math.floor(entry.updated.getTime() / 1000);
    } else if (entry['dc:date']) {
      publishedAt = getDCDateTimestamp(entry['dc:date']);
    } else {
      publishedAt = Math.floor(new Date().getTime() / 1000);
    }

    items.push({
      id: itemId,
      userId: source.userId,
      columnId: source.columnId,
      sourceId: source.id,
      title: entry.title!.value!,
      link: entry.links[0].href!,
      media: getMedia(entry),
      description: getItemDescription(entry),
      author: entry.author?.name,
      publishedAt: publishedAt,
    });
  }

  return { source, items };
};
|
|
|
|
/**
 * `getFeed` requests the url stored in `source.options.rss` (5000 is passed
 * as the timeout argument of `fetchWithTimeout`) and parses the response body
 * with `parseFeed`.
 *
 * This is a best-effort helper: every error (failed request, unparsable body,
 * missing options) is caught and reported as `undefined`, so that the caller
 * can decide how to continue.
 *
 * @param source The source whose `options.rss` url should be fetched.
 * @returns The parsed feed or `undefined` when anything went wrong.
 */
const getFeed = async (source: ISource): Promise<Feed | undefined> => {
  try {
    const requestUrl = source.options!.rss!;
    const response = await utils.fetchWithTimeout(
      requestUrl,
      { method: 'get' },
      5000,
    );
    const body = await response.text();

    utils.log('debug', 'Add source', {
      sourceType: 'rss',
      requestUrl: requestUrl,
      responseStatus: response.status,
    });

    return await parseFeed(body);
  } catch (_) {
    return undefined;
  }
};
|
|
|
|
/**
 * `getFeedFromWebsite` tries to discover a RSS feed on a website. It is meant
 * as fallback for the case that `getFeed` failed, e.g. because the user
 * provided the url of a website instead of the url of a feed.
 *
 * The page behind `source.options.rss` is loaded and searched for a
 * `<link type="application/rss+xml" href="RSS_FEED_URL">` tag. When such a tag
 * with a `href` is found, the `href` is resolved against the url of the page
 * (so that relative links become absolute), written back to
 * `source.options.rss` and then fetched via `getFeed`.
 *
 * @param source The source whose `options.rss` points to a website. Its
 * `options.rss` is overwritten once a feed link was found.
 * @returns The discovered feed or `undefined` when no feed link exists or any
 * step failed.
 */
const getFeedFromWebsite = async (
  source: ISource,
): Promise<Feed | undefined> => {
  try {
    const websiteUrl = source.options!.rss!;
    const response = await utils.fetchWithTimeout(
      websiteUrl,
      { method: 'get' },
      5000,
    );
    const page = cheerio.load(await response.text());

    const feedHref = page('link[type="application/rss+xml"]').attr('href');
    if (feedHref) {
      source.options!.rss = new URL(feedHref, websiteUrl).href;
      return getFeed(source);
    }

    return undefined;
  } catch (_) {
    return undefined;
  }
};
|
|
|
|
/**
 * `skipEntry` is used to determine if an entry should be skipped or not. When
 * an entry in the RSS feed is skipped it will not be added to the database. An
 * entry will be skipped when
 *   - it is not within the first 50 entries of the feed (index 50 and above),
 *     because we only keep the last 50 items of each source in our delete
 *     logic.
 *   - the entry does not contain a title, a link or a published / updated /
 *     `dc:date` date.
 *   - the published / updated / `dc:date` date of the entry is older than (or
 *     equal to) the last update date of the source minus 10 seconds.
 *
 * @param index Zero-based position of the entry within the feed.
 * @param entry The feed entry to check.
 * @param sourceUpdatedAt Last update of the source as Unix timestamp in
 * seconds (`0` when the source was never updated).
 * @returns `true` when the entry must not be turned into an item.
 */
const skipEntry = (
  index: number,
  entry: FeedEntry,
  sourceUpdatedAt: number,
): boolean => {
  /**
   * Every entry after the first 50 is skipped. The check must be `>=`: with a
   * strict equality only the 51st entry would be skipped and all following
   * entries would be processed again.
   */
  const maxEntries = 50;
  if (index >= maxEntries) {
    return true;
  }

  if (
    !entry.title?.value ||
    (entry.links.length === 0 || !entry.links[0].href) ||
    (!entry.published && !entry.updated && !entry['dc:date'])
  ) {
    return true;
  }

  /**
   * Entries which are not newer than this threshold were already handled
   * during a former update of the source.
   */
  const threshold = sourceUpdatedAt - 10;

  if (
    entry.published &&
    Math.floor(entry.published.getTime() / 1000) <= threshold
  ) {
    return true;
  } else if (
    entry.updated &&
    Math.floor(entry.updated.getTime() / 1000) <= threshold
  ) {
    return true;
  } else if (
    entry['dc:date'] &&
    getDCDateTimestamp(entry['dc:date']) <= threshold
  ) {
    return true;
  }

  return false;
};
|
|
|
|
/**
 * `getDCDateTimestamp` converts the value of a `dc:date` tag into a Unix
 * timestamp in seconds (rounded down). The value is either a `Date` itself or
 * an object which carries the `Date` in its `value` property.
 *
 * @param dcdate The parsed `dc:date` value.
 * @returns The Unix timestamp in seconds.
 */
const getDCDateTimestamp = (dcdate: Date | { value: Date }): number => {
  const date = dcdate instanceof Date ? dcdate : dcdate.value;
  return Math.floor(date.getTime() / 1000);
};
|
|
|
|
/**
 * `generateSourceId` builds the id of a RSS source. The id has the form
 * `rss-<userId>-<columnId>-<hash>`, where `<hash>` is the MD5 hash of the link
 * of the RSS feed.
 *
 * @param userId Id of the user who owns the source.
 * @param columnId Id of the column the source belongs to.
 * @param link Link of the RSS feed.
 * @returns The generated source id.
 */
const generateSourceId = (
  userId: string,
  columnId: string,
  link: string,
): string => {
  const linkHash = new Md5().update(link).toString();
  return `rss-${userId}-${columnId}-${linkHash}`;
};
|
|
|
|
/**
 * `generateItemId` builds the id of an item. The id has the form
 * `<sourceId>-<hash>`, where `<hash>` is the MD5 hash of the provided
 * identifier. The identifier can be the id or the link of the item.
 *
 * @param sourceId Id of the source the item belongs to.
 * @param identifier Id or link of the item.
 * @returns The generated item id.
 */
const generateItemId = (sourceId: string, identifier: string): string => {
  const identifierHash = new Md5().update(identifier).toString();
  return `${sourceId}-${identifierHash}`;
};
|
|
|
|
/**
 * `getItemDescription` returns the plain text description for a feed entry.
 * The description of the entry is preferred, the content of the entry is used
 * as fallback. If none of them has a value `undefined` is returned.
 *
 * Everything that looks like a HTML tag (`<...>`) is removed from the text and
 * the result is passed through `unescape` before it is returned.
 *
 * @param entry The feed entry.
 * @returns The cleaned up description or `undefined`.
 */
const getItemDescription = (entry: FeedEntry): string | undefined => {
  const rawText = entry.description?.value || entry.content?.value;
  if (!rawText) {
    return undefined;
  }

  const withoutTags = rawText.replace(/(<([^>]+)>)/ig, '');
  return unescape(withoutTags);
};
|
|
|
|
/**
 * `getMedia` returns a media url for the provided feed `entry` (item) or
 * `undefined` when no suitable image could be found. The following places are
 * checked in this order and the first hit wins:
 *   1. `media:content` tags with a medium of `image`
 *   2. the `media:thumbnails` tag
 *   3. `media:content` tags with a medium of `image` within `media:group` tags
 *   4. attachments with an `image/*` mime type
 *   5. the first `img` tag within the description
 *   6. the first `img` tag within the content
 *
 * An url is only used when it starts with `https://`. Except for the
 * `media:thumbnails` tag, urls which end with `.svg` are ignored.
 *
 * @param entry The feed entry.
 * @returns The url of the image or `undefined`.
 */
const getMedia = (entry: FeedEntry): string | undefined => {
  /**
   * Checks that the url is set, uses https and does not point to a svg file.
   */
  const isUsableImageUrl = (url: string | undefined): boolean => {
    return !!url && url.startsWith('https://') && !url.endsWith('.svg');
  };

  /**
   * Returns the `src` of the first `img` tag within the provided html, when it
   * is a usable image url.
   */
  const findEmbeddedImage = (html: string | undefined): string | undefined => {
    if (!html) {
      return undefined;
    }

    const found = /<img[^>]+\bsrc=["']([^"']+)["']/.exec(unescape(html));
    if (found && found.length == 2 && isUsableImageUrl(found[1])) {
      return found[1];
    }
    return undefined;
  };

  const mediaContents = entry['media:content'];
  if (mediaContents && mediaContents.length > 0) {
    for (const mediaContent of mediaContents) {
      if (
        mediaContent.medium === 'image' && isUsableImageUrl(mediaContent.url)
      ) {
        return mediaContent.url;
      }
    }
  }

  /**
   * Note: For the thumbnail the `.svg` suffix is not checked.
   */
  const thumbnailUrl = entry['media:thumbnails']?.url;
  if (thumbnailUrl && thumbnailUrl.startsWith('https://')) {
    return thumbnailUrl;
  }

  const mediaGroups = entry['media:group'];
  if (mediaGroups && mediaGroups.length > 0) {
    for (const mediaGroup of mediaGroups) {
      const groupContents = mediaGroup['media:content'];
      if (!groupContents) {
        continue;
      }

      for (const groupContent of groupContents) {
        if (
          groupContent.medium === 'image' && isUsableImageUrl(groupContent.url)
        ) {
          return groupContent.url;
        }
      }
    }
  }

  const attachments = entry.attachments;
  if (attachments && attachments.length > 0) {
    for (const attachment of attachments) {
      if (
        attachment.mimeType && attachment.mimeType.startsWith('image/') &&
        isUsableImageUrl(attachment.url)
      ) {
        return attachment.url;
      }
    }
  }

  /**
   * As last resort look for an image within the description and afterwards
   * within the content of the entry.
   */
  const descriptionImage = findEmbeddedImage(entry.description?.value);
  if (descriptionImage) {
    return descriptionImage;
  }

  return findEmbeddedImage(entry.content?.value);
};
|