mirror of
https://github.com/feeddeck/feeddeck.git
synced 2026-03-11 17:47:47 -05:00
This commit adds tests for all available sources. This commit also fixes the parsing of Atom feeds for the RSS source, where the `dc:date` field must be used for the `publishedAt` field.
309 lines
8.9 KiB
TypeScript
309 lines
8.9 KiB
TypeScript
import { SupabaseClient } from '@supabase/supabase-js';
|
|
import { parseFeed } from 'rss';
|
|
import { Md5 } from 'std/md5';
|
|
import { FeedEntry } from 'rss/types';
|
|
import { Redis } from 'redis';
|
|
import { unescape } from 'lodash';
|
|
|
|
import { IItem } from '../models/item.ts';
|
|
import { ISource } from '../models/source.ts';
|
|
import { Favicon, feedutils } from './utils/index.ts';
|
|
import { IProfile } from '../models/profile.ts';
|
|
import { utils } from '../utils/index.ts';
|
|
|
|
/**
 * `faviconFilter` narrows a list of favicon candidates down to the ones whose
 * url begins with `https://cdn-images`, i.e. the favicons served from the
 * Medium CDN. The input array is not modified, a new array is returned.
 */
export const faviconFilter = (favicons: Favicon[]): Favicon[] => {
  const mediumCdnPrefix = 'https://cdn-images';
  const hostedOnCdn: Favicon[] = [];

  for (const candidate of favicons) {
    if (candidate.url.startsWith(mediumCdnPrefix)) {
      hostedOnCdn.push(candidate);
    }
  }

  return hostedOnCdn;
};
|
|
|
|
/**
 * `parseMediumOption` parses the provided `medium` option and returns the url
 * of the corresponding Medium RSS feed. The following inputs are supported:
 *
 * - a tag, e.g. `#security` -> `https://medium.com/feed/tag/security`
 * - a username, e.g. `@user` -> `https://medium.com/feed/@user`
 * - a `medium.com` url, e.g. `https://medium.com/@user` ->
 *   `https://medium.com/feed/@user`
 * - a `<name>.medium.com` url -> `https://<name>.medium.com/feed`
 *
 * For `medium.com` urls the feed path is derived from the parsed pathname of
 * the url, so that the scheme (`http://` vs. `https://`), the casing of the
 * hostname, a missing trailing slash, a query string or a fragment do not
 * leak into the generated feed url. An already normalized feed url is
 * returned unchanged.
 *
 * @throws {Error} `Invalid source options` when the input is empty or when
 * the url does not point to `medium.com` or a direct subdomain of it.
 * @throws {TypeError} when the input is neither a tag nor a username and can
 * not be parsed by the `URL` constructor.
 */
export const parseMediumOption = (input?: string): string => {
  if (!input) {
    throw new Error('Invalid source options');
  }

  if (input.length > 1 && input[0] === '#') {
    return `https://medium.com/feed/tag/${input.slice(1)}`;
  }

  if (input.length > 1 && input[0] === '@') {
    return `https://medium.com/feed/${input}`;
  }

  const parsedUrl = new URL(input);
  const parsedHostname = parsedUrl.hostname.split('.');

  if (
    parsedHostname.length === 2 && parsedHostname[0] === 'medium' &&
    parsedHostname[1] === 'com'
  ) {
    /**
     * Remove the leading slashes and a leading `feed` segment from the path,
     * so that a normal url and a feed url result in the same feed url.
     */
    const path = parsedUrl.pathname
      .replace(/^\/+/, '')
      .replace(/^feed(\/|$)/, '');
    return `https://medium.com/feed/${path}`;
  }

  if (
    parsedHostname.length === 3 && parsedHostname[1] === 'medium' &&
    parsedHostname[2] === 'com'
  ) {
    return `https://${parsedHostname[0]}.medium.com/feed`;
  }

  throw new Error('Invalid source options');
};
|
|
|
|
/**
 * `isMediumUrl` checks if the provided `url` is a Medium url. A url is
 * considered a Medium url when its hostname is exactly `medium.com` or a
 * subdomain of it (the hostname ends with `.medium.com`). Look-alike domains
 * such as `notmedium.com` are rejected.
 *
 * @throws {TypeError} when `url` can not be parsed by the `URL` constructor.
 */
export const isMediumUrl = (url: string): boolean => {
  const { hostname } = new URL(url);
  return hostname === 'medium.com' || hostname.endsWith('.medium.com');
};
|
|
|
|
/**
 * `getMediumFeed` fetches and parses the Medium RSS feed for the `medium`
 * option of the provided `source`. It fills in the missing source fields (id,
 * type, title, options, link and, for new sources, the icon) and returns the
 * updated source together with the items generated from the feed entries.
 *
 * The provided `source` object is modified in place and is the same object
 * which is returned. The `_redisClient` and `_profile` parameters are not
 * used by this function.
 *
 * @throws {Error} `Invalid source options` when the `medium` option can not be
 * parsed and `Invalid feed` when the parsed feed doesn't have a title. Errors
 * from fetching or parsing the feed are not caught here.
 */
export const getMediumFeed = async (
  supabaseClient: SupabaseClient,
  _redisClient: Redis | undefined,
  _profile: IProfile,
  source: ISource,
): Promise<{ source: ISource; items: IItem[] }> => {
  const parsedMediumOption = parseMediumOption(source.options?.medium);

  /**
   * Get the RSS feed for the provided `medium` url and parse it. If the feed
   * doesn't contain a title we throw an error. The value `5000` is passed to
   * `fetchWithTimeout` (presumably the timeout in milliseconds, see the
   * implementation of the helper).
   */
  const response = await utils.fetchWithTimeout(parsedMediumOption, {
    method: 'get',
  }, 5000);
  const xml = await response.text();
  utils.log('debug', 'Add source', {
    sourceType: 'medium',
    requestUrl: parsedMediumOption,
    responseStatus: response.status,
  });
  const feed = await parseFeed(xml);

  if (!feed.title?.value) {
    throw new Error('Invalid feed');
  }

  /**
   * When the source doesn't have an id yet we try to get a favicon from the
   * feed for the source. We check if the source has an id because we only want
   * to try to get the favicon when the source is created the first time.
   */
  if (source.id === '' && feed.links.length > 0) {
    const favicon = await feedutils.getFavicon(feed.links[0], faviconFilter);

    if (favicon && favicon.url.startsWith('https://')) {
      /**
       * The icon must be set on the source before the upload, because the
       * source is passed to `uploadSourceIcon`, whose result then replaces the
       * icon.
       */
      source.icon = favicon.url;
      source.icon = await feedutils.uploadSourceIcon(supabaseClient, source);
    }
  }

  /**
   * Generate a source id based on the user id, column id and the normalized
   * `medium` url. Besides that we also set the source type to `medium` and set
   * the title, options and link for the source.
   */
  if (source.id === '') {
    source.id = generateSourceId(
      source.userId,
      source.columnId,
      parsedMediumOption,
    );
  }
  source.type = 'medium';
  source.title = feed.title.value;
  source.options = { medium: parsedMediumOption };
  if (feed.links.length > 0) {
    source.link = feed.links[0];
  }

  /**
   * Now that the source contains all the required information we can start to
   * generate the items for the source, by looping over all the feed entries.
   */
  const items: IItem[] = [];

  for (const [index, entry] of feed.entries.entries()) {
    if (skipEntry(index, entry, source.updatedAt || 0)) {
      continue;
    }

    /**
     * Each item needs a unique id which is generated using the
     * `generateItemId` function. The id is a combination of the source id and
     * the id of the entry or, if the entry does not have an id (empty or not
     * set at all), the first link of the entry. Entries without an id and
     * without a link are ignored.
     */
    let itemId = '';
    if (entry.id) {
      itemId = generateItemId(source.id, entry.id);
    } else if (entry.links.length > 0 && entry.links[0].href) {
      itemId = generateItemId(source.id, entry.links[0].href);
    } else {
      continue;
    }

    /**
     * Create the item object and add it to the `items` array. The non-null
     * assertions for the title, link and published date are safe, because
     * `skipEntry` returns `true` for entries where one of them is missing.
     */
    items.push({
      id: itemId,
      userId: source.userId,
      columnId: source.columnId,
      sourceId: source.id,
      title: entry.title!.value!,
      link: entry.links[0].href!,
      media: getMedia(entry),
      description: getItemDescription(entry),
      author: entry['dc:creator']?.join(', '),
      publishedAt: Math.floor(entry.published!.getTime() / 1000),
    });
  }

  return { source, items };
};
|
|
|
|
/**
 * `skipEntry` is used to determine if an entry should be skipped or not. When
 * an entry in the RSS feed is skipped it will not be added to the database. An
 * entry will be skipped when
 * - it is not within the first 50 entries of the feed (`index` is 50 or
 *   larger), because we only keep the last 50 items of each source in our
 *   delete logic.
 * - the entry does not contain a title, a link or a published date.
 * - the published date of the entry (in seconds) is older than or equal to the
 *   last update date of the source minus 10 seconds.
 * - the title of the entry looks like spam (see below).
 *
 * @param index The zero-based position of the entry within the feed.
 * @param entry The feed entry which should be checked.
 * @param sourceUpdatedAt The last update of the source as Unix timestamp in
 * seconds, or `0` when the source was not updated yet.
 * @returns `true` when the entry should be skipped, `false` otherwise.
 */
const skipEntry = (
  index: number,
  entry: FeedEntry,
  sourceUpdatedAt: number,
): boolean => {
  /**
   * All entries after the first 50 must be skipped and not only the entry at
   * position 50, because the caller continues to loop over the remaining
   * entries.
   */
  if (index >= 50) {
    return true;
  }

  if (
    !entry.title?.value ||
    (entry.links.length === 0 || !entry.links[0].href) || !entry.published
  ) {
    return true;
  }

  if (Math.floor(entry.published.getTime() / 1000) <= (sourceUpdatedAt - 10)) {
    return true;
  }

  /**
   * Skip entries which might be spam. To detect possible spam, we check the
   * lower-cased title of the entry against a list of words, when the title
   * contains 3 or more of these words we consider the entry as spam. The
   * words are matched as substrings and each word is counted at most once.
   */
  const filterWords = [
    'cash',
    'loan',
    'customer',
    'care',
    'helpline',
    'number',
    'patti',
    'toll',
    'free',
    'paisa',
    'call',
    'kup',
    'niewykrywalnych',
    'fałszywych',
    'pieniędzy',
    'whatsapp',
    'money',
  ];
  const title = entry.title.value.toLowerCase();
  const score = filterWords.filter((word) => title.includes(word)).length;

  return score >= 3;
};
|
|
|
|
/**
 * `generateSourceId` builds the id of a Medium source. The id has the form
 * `medium-<userId>-<columnId>-<hash>`, where `<hash>` is the MD5 hash of the
 * link of the RSS feed.
 */
const generateSourceId = (
  userId: string,
  columnId: string,
  link: string,
): string => {
  const linkHash = new Md5().update(link).toString();
  return ['medium', userId, columnId, linkHash].join('-');
};
|
|
|
|
/**
 * `generateItemId` builds the id of an item. The id has the form
 * `<sourceId>-<hash>`, where `<hash>` is the MD5 hash of the identifier of the
 * item. The identifier can be the id of the item or the link of the item.
 */
const generateItemId = (sourceId: string, identifier: string): string => {
  const identifierHash = new Md5().update(identifier).toString();
  return sourceId + '-' + identifierHash;
};
|
|
|
|
/**
 * `getItemDescription` returns the unescaped description of the item. The
 * `content` of the entry is preferred, the `description` of the entry is only
 * used when the entry has no (or an empty) content. If both are missing
 * `undefined` is returned.
 */
const getItemDescription = (entry: FeedEntry): string | undefined => {
  const rawDescription = entry.content?.value || entry.description?.value;
  return rawDescription ? unescape(rawDescription) : undefined;
};
|
|
|
|
/**
 * `getMedia` returns the url of an image for the provided feed entry. The
 * unescaped content of the entry is searched first, followed by the unescaped
 * description. For each of them only the first `<img>` tag is considered and
 * its `src` is only used when it starts with `https://`. If no image could be
 * found this way `undefined` is returned.
 */
const getMedia = (entry: FeedEntry): string | undefined => {
  const imagePattern = /<img[^>]+\bsrc=["']([^"']+)["']/;

  for (const html of [entry.content?.value, entry.description?.value]) {
    if (!html) {
      continue;
    }

    const found = imagePattern.exec(unescape(html));
    if (found && found.length == 2 && found[1].startsWith('https://')) {
      return found[1];
    }
  }

  return undefined;
};
|