From 5a5bce73fcb42174532ab79fe68bc7db0a577862 Mon Sep 17 00:00:00 2001 From: Andvari <31068367+dzx-dzx@users.noreply.github.com> Date: Sat, 17 Aug 2024 23:12:36 +0800 Subject: [PATCH] feat(route/apnews): Add support for sitemap (#16447) * feat(route/apnews): Add support for sitemap * Update sitemap.ts * Update sitemap.ts --- lib/routes/apnews/sitemap.ts | 98 ++++++++++++++++++++++++++++++++++++ lib/routes/apnews/utils.ts | 3 +- 2 files changed, 100 insertions(+), 1 deletion(-) create mode 100644 lib/routes/apnews/sitemap.ts diff --git a/lib/routes/apnews/sitemap.ts b/lib/routes/apnews/sitemap.ts new file mode 100644 index 00000000000000..4102d5e1076bc7 --- /dev/null +++ b/lib/routes/apnews/sitemap.ts @@ -0,0 +1,98 @@ +import { Route, ViewType } from '@/types'; +import { fetchArticle } from './utils'; +import ofetch from '@/utils/ofetch'; +import { load } from 'cheerio'; +import { parseDate } from '@/utils/parse-date'; +import asyncPool from 'tiny-async-pool'; +import timezone from '@/utils/timezone'; +const HOME_PAGE = 'https://apnews.com'; + +export const route: Route = { + path: '/sitemap/:route', + categories: ['traditional-media'], + example: '/apnews/sitemap/ap-sitemap-latest', + view: ViewType.Articles, + parameters: { + route: { + description: 'Route for sitemap, excluding the `.xml` extension', + }, + }, + features: { + requireConfig: false, + requirePuppeteer: false, + antiCrawler: false, + supportBT: false, + supportPodcast: false, + supportScihub: false, + }, + radar: [ + { + source: ['apnews.com/'], + }, + ], + name: 'Sitemap', + maintainers: ['zoenglinghou', 'mjysci', 'TonyRL', 'dzx-dzx'], + handler, +}; + +async function handler(ctx) { + const route = ctx.req.param('route'); + const url = `${HOME_PAGE}/${route}.xml`; + const response = await ofetch(url); + const $ = load(response); + + const list = $('urlset url') + .toArray() + .map((e) => { + const LANGUAGE_MAP = new Map([ + ['eng', 'en'], + ['spa', 'es'], + ]); + + const title = $(e) + 
.find(String.raw`news\:title`) + .text(); + const pubDate = parseDate( + $(e) + .find(String.raw`news\:publication_date`) + .text() + ); + const lastmod = timezone(parseDate($(e).find(`lastmod`).text()), -4); + const language = LANGUAGE_MAP.get( + $(e) + .find(String.raw`news\:language`) + .text() + ); + let res = { link: $(e).find('loc').text() }; + if (title) { + res = Object.assign(res, { title }); + } + if (pubDate.toString() !== 'Invalid Date') { + res = Object.assign(res, { pubDate }); + } + if (language) { + res = Object.assign(res, { language }); + } + if (lastmod.toString() !== 'Invalid Date') { + res = Object.assign(res, { lastmod }); + } + return res; + }) + .filter((e) => Boolean(e.link) && !new URL(e.link).pathname.split('/').includes('hub')) + .sort((a, b) => (a.pubDate && b.pubDate ? b.pubDate - a.pubDate : b.lastmod - a.lastmod)) + .slice(0, ctx.req.query('limit') ? Number.parseInt(ctx.req.query('limit'), 10) : 20); + + const items = await asyncPoolAll(20, list, (item) => fetchArticle(item)); + + return { + title: `AP News sitemap:${route}`, + item: items, + }; +} +async function asyncPoolAll<IN, OUT>(poolLimit: number, array: readonly IN[], iteratorFn: (generator: IN) => Promise<OUT>) { + const results: Awaited<OUT[]> = []; + for await (const result of asyncPool(poolLimit, array, iteratorFn)) { + results.push(result); + } + return results; +} diff --git a/lib/routes/apnews/utils.ts b/lib/routes/apnews/utils.ts index 035c184ad7a257..f1b0c1c728480b 100644 --- a/lib/routes/apnews/utils.ts +++ b/lib/routes/apnews/utils.ts @@ -17,12 +17,13 @@ export function fetchArticle(item) { const rawLdjson = JSON.parse($('#link-ld-json').text()); let ldjson; if (rawLdjson['@type'] === 'NewsArticle' || (Array.isArray(rawLdjson) && rawLdjson.some((e) => e['@type'] === 'NewsArticle'))) { - // Regular + // Regular(Articles, Videos) ldjson = Array.isArray(rawLdjson) ? 
rawLdjson.find((e) => e['@type'] === 'NewsArticle') : rawLdjson; $('div.Enhancement').remove(); const section = $("meta[property='article:section']").attr('content'); return { + title: ldjson.headline, pubDate: parseDate(ldjson.datePublished), updated: parseDate(ldjson.dateModified), description: $('div.RichTextStoryBody').html() || $(':is(.VideoLead, .VideoPage-pageSubHeading)').html(),