mirror of
https://github.com/feeddeck/feeddeck.git
synced 2026-03-09 07:02:01 -05:00
Prefer content over description for Item Description (#243)
Until now, we always checked the `description` field of an item first to get our description value. If the `description` field was present, we used it directly and never checked the `content` field. Now we check the `content` field first and the `description` field afterwards, so by default the `content` field is used as our description. When both fields are present in an RSS feed, the `content` field more commonly contains the full article text. When both fields are present and the `description` field is longer than the `content` field, we still prefer the `description` field.
This commit is contained in:
@@ -1,15 +1,15 @@
|
|||||||
import { SupabaseClient } from '@supabase/supabase-js';
|
import { SupabaseClient } from "@supabase/supabase-js";
|
||||||
import { Feed } from 'rss';
|
import { Feed } from "rss";
|
||||||
import { FeedEntry } from 'rss/types';
|
import { FeedEntry } from "rss/types";
|
||||||
import { Redis } from 'redis';
|
import { Redis } from "redis";
|
||||||
import { unescape } from 'lodash';
|
import { unescape } from "lodash";
|
||||||
import * as cheerio from 'cheerio';
|
import * as cheerio from "cheerio";
|
||||||
|
|
||||||
import { IItem } from '../models/item.ts';
|
import { IItem } from "../models/item.ts";
|
||||||
import { ISource } from '../models/source.ts';
|
import { ISource } from "../models/source.ts";
|
||||||
import { feedutils } from './utils/index.ts';
|
import { feedutils } from "./utils/index.ts";
|
||||||
import { IProfile } from '../models/profile.ts';
|
import { IProfile } from "../models/profile.ts";
|
||||||
import { utils } from '../utils/index.ts';
|
import { utils } from "../utils/index.ts";
|
||||||
|
|
||||||
export const getRSSFeed = async (
|
export const getRSSFeed = async (
|
||||||
supabaseClient: SupabaseClient,
|
supabaseClient: SupabaseClient,
|
||||||
@@ -24,19 +24,19 @@ export const getRSSFeed = async (
|
|||||||
* feed.
|
* feed.
|
||||||
*/
|
*/
|
||||||
if (!source.options?.rss) {
|
if (!source.options?.rss) {
|
||||||
throw new feedutils.FeedValidationError('Invalid source options');
|
throw new feedutils.FeedValidationError("Invalid source options");
|
||||||
}
|
}
|
||||||
|
|
||||||
let feed = await getFeed(source, feedData);
|
let feed = await getFeed(source, feedData);
|
||||||
if (!feed) {
|
if (!feed) {
|
||||||
utils.log(
|
utils.log(
|
||||||
'debug',
|
"debug",
|
||||||
'Failed to get RSS feed, try to get RSS feed from website',
|
"Failed to get RSS feed, try to get RSS feed from website",
|
||||||
{ requestUrl: source.options.rss },
|
{ requestUrl: source.options.rss },
|
||||||
);
|
);
|
||||||
feed = await getFeedFromWebsite(source);
|
feed = await getFeedFromWebsite(source);
|
||||||
if (!feed) {
|
if (!feed) {
|
||||||
throw new Error('Failed to get RSS feed');
|
throw new Error("Failed to get RSS feed");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -45,7 +45,7 @@ export const getRSSFeed = async (
|
|||||||
* error.
|
* error.
|
||||||
*/
|
*/
|
||||||
if (!feed.title.value) {
|
if (!feed.title.value) {
|
||||||
throw new Error('Invalid feed');
|
throw new Error("Invalid feed");
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -54,14 +54,14 @@ export const getRSSFeed = async (
|
|||||||
* user id, the column id and the link of the RSS feed. We also set the type
|
* user id, the column id and the link of the RSS feed. We also set the type
|
||||||
* of the source to `rss` and the title to the title of the feed.
|
* of the source to `rss` and the title to the title of the feed.
|
||||||
*/
|
*/
|
||||||
if (source.id === '') {
|
if (source.id === "") {
|
||||||
source.id = await generateSourceId(
|
source.id = await generateSourceId(
|
||||||
source.userId,
|
source.userId,
|
||||||
source.columnId,
|
source.columnId,
|
||||||
source.options.rss,
|
source.options.rss,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
source.type = 'rss';
|
source.type = "rss";
|
||||||
source.title = feed.title.value;
|
source.title = feed.title.value;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -85,15 +85,15 @@ export const getRSSFeed = async (
|
|||||||
if (!source.icon) {
|
if (!source.icon) {
|
||||||
if (source.link) {
|
if (source.link) {
|
||||||
const favicon = await feedutils.getFavicon(source.link);
|
const favicon = await feedutils.getFavicon(source.link);
|
||||||
if (favicon && favicon.url.startsWith('https://')) {
|
if (favicon && favicon.url.startsWith("https://")) {
|
||||||
source.icon = favicon.url;
|
source.icon = favicon.url;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!source.icon) {
|
if (!source.icon) {
|
||||||
if (feed.icon && feed.icon.startsWith('https://')) {
|
if (feed.icon && feed.icon.startsWith("https://")) {
|
||||||
source.icon = feed.icon;
|
source.icon = feed.icon;
|
||||||
} else if (feed.image?.url && feed.image.url.startsWith('https://')) {
|
} else if (feed.image?.url && feed.image.url.startsWith("https://")) {
|
||||||
source.icon = feed.image?.url;
|
source.icon = feed.image?.url;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -118,7 +118,7 @@ export const getRSSFeed = async (
|
|||||||
* entry or if the entry does not have an id we use the link of the first
|
* entry or if the entry does not have an id we use the link of the first
|
||||||
* link of the entry.
|
* link of the entry.
|
||||||
*/
|
*/
|
||||||
let itemId = '';
|
let itemId = "";
|
||||||
if (entry.id) {
|
if (entry.id) {
|
||||||
itemId = await generateItemId(source.id, entry.id);
|
itemId = await generateItemId(source.id, entry.id);
|
||||||
} else {
|
} else {
|
||||||
@@ -147,10 +147,10 @@ export const getRSSFeed = async (
|
|||||||
publishedAt: entry.published
|
publishedAt: entry.published
|
||||||
? Math.floor(entry.published.getTime() / 1000)
|
? Math.floor(entry.published.getTime() / 1000)
|
||||||
: entry.updated
|
: entry.updated
|
||||||
? Math.floor(entry.updated.getTime() / 1000)
|
? Math.floor(entry.updated.getTime() / 1000)
|
||||||
: entry['dc:date']
|
: entry["dc:date"]
|
||||||
? getDCDateTimestamp(entry['dc:date'])
|
? getDCDateTimestamp(entry["dc:date"])
|
||||||
: Math.floor(new Date().getTime() / 1000),
|
: Math.floor(new Date().getTime() / 1000),
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -198,17 +198,17 @@ const getFeedFromWebsite = async (
|
|||||||
try {
|
try {
|
||||||
const response = await utils.fetchWithTimeout(
|
const response = await utils.fetchWithTimeout(
|
||||||
source.options!.rss!,
|
source.options!.rss!,
|
||||||
{ method: 'get' },
|
{ method: "get" },
|
||||||
5000,
|
5000,
|
||||||
);
|
);
|
||||||
const html = await response.text();
|
const html = await response.text();
|
||||||
|
|
||||||
const $ = cheerio.load(html);
|
const $ = cheerio.load(html);
|
||||||
let rssLink = $('link[type="application/rss+xml"]').attr('href');
|
let rssLink = $('link[type="application/rss+xml"]').attr("href");
|
||||||
if (!rssLink) {
|
if (!rssLink) {
|
||||||
rssLink = $('link[type="application/atom+xml"]').attr('href');
|
rssLink = $('link[type="application/atom+xml"]').attr("href");
|
||||||
if (!rssLink) {
|
if (!rssLink) {
|
||||||
rssLink = $('link[type="application/rdf+xml"]').attr('href');
|
rssLink = $('link[type="application/rdf+xml"]').attr("href");
|
||||||
if (!rssLink) {
|
if (!rssLink) {
|
||||||
return undefined;
|
return undefined;
|
||||||
}
|
}
|
||||||
@@ -243,25 +243,26 @@ const skipEntry = (
|
|||||||
|
|
||||||
if (
|
if (
|
||||||
!entry.title?.value ||
|
!entry.title?.value ||
|
||||||
(entry.links.length === 0 || !entry.links[0].href) ||
|
entry.links.length === 0 ||
|
||||||
(!entry.published && !entry.updated && !entry['dc:date'])
|
!entry.links[0].href ||
|
||||||
|
(!entry.published && !entry.updated && !entry["dc:date"])
|
||||||
) {
|
) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (
|
if (
|
||||||
entry.published &&
|
entry.published &&
|
||||||
Math.floor(entry.published.getTime() / 1000) <= (sourceUpdatedAt - 10)
|
Math.floor(entry.published.getTime() / 1000) <= sourceUpdatedAt - 10
|
||||||
) {
|
) {
|
||||||
return true;
|
return true;
|
||||||
} else if (
|
} else if (
|
||||||
entry.updated &&
|
entry.updated &&
|
||||||
Math.floor(entry.updated.getTime() / 1000) <= (sourceUpdatedAt - 10)
|
Math.floor(entry.updated.getTime() / 1000) <= sourceUpdatedAt - 10
|
||||||
) {
|
) {
|
||||||
return true;
|
return true;
|
||||||
} else if (
|
} else if (
|
||||||
entry['dc:date'] &&
|
entry["dc:date"] &&
|
||||||
getDCDateTimestamp(entry['dc:date']) <= (sourceUpdatedAt - 10)
|
getDCDateTimestamp(entry["dc:date"]) <= sourceUpdatedAt - 10
|
||||||
) {
|
) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@@ -309,21 +310,32 @@ const generateItemId = async (
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* `getItemDescription` returns the description of an item based on the provided
|
* `getItemDescription` returns the description of an item based on the provided
|
||||||
* description and content. In the first step we try to use the description of
|
* description and content. In the first step we try to use the content of the
|
||||||
* the items as our description. If that is not available, we try to use the
|
* item as our description. If the item doesn't have a content field we will
|
||||||
* content. If that is not available, we return undefined. We also remove all
|
* try to use the description field. If the item contains a content and
|
||||||
* HTML tags from the description and content before returning it.
|
* description field we will use the content field, except for cases where the
|
||||||
|
* text within the description field is longer.
|
||||||
|
*
|
||||||
|
* NOTE: The order was changed from "description | content" to
|
||||||
|
* "content | description", because it is more common that the full text is
|
||||||
|
* within the content field when an item contains both fields. The fallback of
|
||||||
|
* comparison of the length should be good enough as fallback.
|
||||||
*/
|
*/
|
||||||
const getItemDescription = (entry: FeedEntry): string | undefined => {
|
const getItemDescription = (entry: FeedEntry): string | undefined => {
|
||||||
if (entry.description?.value) {
|
let content = undefined;
|
||||||
return unescape(entry.description?.value);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (entry.content?.value) {
|
if (entry.content?.value) {
|
||||||
return unescape(entry.content?.value);
|
content = unescape(entry.content?.value);
|
||||||
}
|
}
|
||||||
|
|
||||||
return undefined;
|
if (entry.description?.value) {
|
||||||
|
const description = unescape(entry.description?.value);
|
||||||
|
if (!content || description.length > content?.length) {
|
||||||
|
content = description;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return content;
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -336,12 +348,14 @@ const getItemDescription = (entry: FeedEntry): string | undefined => {
|
|||||||
* for the media field.
|
* for the media field.
|
||||||
*/
|
*/
|
||||||
const getMedia = (entry: FeedEntry): string | undefined => {
|
const getMedia = (entry: FeedEntry): string | undefined => {
|
||||||
if (entry['media:content'] && entry['media:content'].length > 0) {
|
if (entry["media:content"] && entry["media:content"].length > 0) {
|
||||||
for (const media of entry['media:content']) {
|
for (const media of entry["media:content"]) {
|
||||||
if (
|
if (
|
||||||
media.medium && media.medium === 'image' && media.url &&
|
media.medium &&
|
||||||
(media.url.startsWith('https://') || media.url.startsWith('http://')) &&
|
media.medium === "image" &&
|
||||||
!media.url.endsWith('.svg')
|
media.url &&
|
||||||
|
(media.url.startsWith("https://") || media.url.startsWith("http://")) &&
|
||||||
|
!media.url.endsWith(".svg")
|
||||||
) {
|
) {
|
||||||
return media.url;
|
return media.url;
|
||||||
}
|
}
|
||||||
@@ -349,23 +363,25 @@ const getMedia = (entry: FeedEntry): string | undefined => {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (
|
if (
|
||||||
entry['media:thumbnails'] && entry['media:thumbnails'].url &&
|
entry["media:thumbnails"] &&
|
||||||
(entry['media:thumbnails'].url.startsWith('https://') ||
|
entry["media:thumbnails"].url &&
|
||||||
entry['media:thumbnails'].url.startsWith('http://'))
|
(entry["media:thumbnails"].url.startsWith("https://") ||
|
||||||
|
entry["media:thumbnails"].url.startsWith("http://"))
|
||||||
) {
|
) {
|
||||||
return entry['media:thumbnails'].url;
|
return entry["media:thumbnails"].url;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (entry['media:group'] && entry['media:group'].length > 0) {
|
if (entry["media:group"] && entry["media:group"].length > 0) {
|
||||||
for (const mediaGroup of entry['media:group']) {
|
for (const mediaGroup of entry["media:group"]) {
|
||||||
if (mediaGroup['media:content']) {
|
if (mediaGroup["media:content"]) {
|
||||||
for (const mediaContent of mediaGroup['media:content']) {
|
for (const mediaContent of mediaGroup["media:content"]) {
|
||||||
if (
|
if (
|
||||||
mediaContent.medium && mediaContent.medium === 'image' &&
|
mediaContent.medium &&
|
||||||
|
mediaContent.medium === "image" &&
|
||||||
mediaContent.url &&
|
mediaContent.url &&
|
||||||
(mediaContent.url.startsWith('https://') ||
|
(mediaContent.url.startsWith("https://") ||
|
||||||
mediaContent.url.startsWith('http://')) &&
|
mediaContent.url.startsWith("http://")) &&
|
||||||
!mediaContent.url.endsWith('.svg')
|
!mediaContent.url.endsWith(".svg")
|
||||||
) {
|
) {
|
||||||
return mediaContent.url;
|
return mediaContent.url;
|
||||||
}
|
}
|
||||||
@@ -377,11 +393,12 @@ const getMedia = (entry: FeedEntry): string | undefined => {
|
|||||||
if (entry.attachments && entry.attachments.length > 0) {
|
if (entry.attachments && entry.attachments.length > 0) {
|
||||||
for (const attachment of entry.attachments) {
|
for (const attachment of entry.attachments) {
|
||||||
if (
|
if (
|
||||||
attachment.mimeType && attachment.mimeType.startsWith('image/') &&
|
attachment.mimeType &&
|
||||||
|
attachment.mimeType.startsWith("image/") &&
|
||||||
attachment.url &&
|
attachment.url &&
|
||||||
(attachment.url.startsWith('https://') ||
|
(attachment.url.startsWith("https://") ||
|
||||||
attachment.url.startsWith('http://')) &&
|
attachment.url.startsWith("http://")) &&
|
||||||
!attachment.url.endsWith('.svg')
|
!attachment.url.endsWith(".svg")
|
||||||
) {
|
) {
|
||||||
return attachment.url;
|
return attachment.url;
|
||||||
}
|
}
|
||||||
@@ -393,9 +410,10 @@ const getMedia = (entry: FeedEntry): string | undefined => {
|
|||||||
unescape(entry.description.value),
|
unescape(entry.description.value),
|
||||||
);
|
);
|
||||||
if (
|
if (
|
||||||
matches && matches.length == 2 &&
|
matches &&
|
||||||
(matches[1].startsWith('https://') || matches[1].startsWith('http://')) &&
|
matches.length == 2 &&
|
||||||
!matches[1].endsWith('.svg')
|
(matches[1].startsWith("https://") || matches[1].startsWith("http://")) &&
|
||||||
|
!matches[1].endsWith(".svg")
|
||||||
) {
|
) {
|
||||||
return matches[1];
|
return matches[1];
|
||||||
}
|
}
|
||||||
@@ -406,9 +424,10 @@ const getMedia = (entry: FeedEntry): string | undefined => {
|
|||||||
unescape(entry.content.value),
|
unescape(entry.content.value),
|
||||||
);
|
);
|
||||||
if (
|
if (
|
||||||
matches && matches.length == 2 &&
|
matches &&
|
||||||
(matches[1].startsWith('https://') || matches[1].startsWith('http://')) &&
|
matches.length == 2 &&
|
||||||
!matches[1].endsWith('.svg')
|
(matches[1].startsWith("https://") || matches[1].startsWith("http://")) &&
|
||||||
|
!matches[1].endsWith(".svg")
|
||||||
) {
|
) {
|
||||||
return matches[1];
|
return matches[1];
|
||||||
}
|
}
|
||||||
@@ -425,10 +444,11 @@ const getVideo = (entry: FeedEntry): string | undefined => {
|
|||||||
if (entry.attachments && entry.attachments.length > 0) {
|
if (entry.attachments && entry.attachments.length > 0) {
|
||||||
for (const attachment of entry.attachments) {
|
for (const attachment of entry.attachments) {
|
||||||
if (
|
if (
|
||||||
attachment.mimeType && attachment.mimeType.startsWith('video/') &&
|
attachment.mimeType &&
|
||||||
|
attachment.mimeType.startsWith("video/") &&
|
||||||
attachment.url &&
|
attachment.url &&
|
||||||
(attachment.url.startsWith('https://') ||
|
(attachment.url.startsWith("https://") ||
|
||||||
attachment.url.startsWith('http://'))
|
attachment.url.startsWith("http://"))
|
||||||
) {
|
) {
|
||||||
return attachment.url;
|
return attachment.url;
|
||||||
}
|
}
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user