Skip to content

Commit

Permalink
feat(route/apnews): Add support for sitemap (#16447)
Browse files Browse the repository at this point in the history
* feat(route/apnews): Add support for sitemap

* Update sitemap.ts

* Update sitemap.ts
  • Loading branch information
dzx-dzx authored Aug 17, 2024
1 parent dff8192 commit 5a5bce7
Show file tree
Hide file tree
Showing 2 changed files with 100 additions and 1 deletion.
98 changes: 98 additions & 0 deletions lib/routes/apnews/sitemap.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
import { Route, ViewType } from '@/types';
import { fetchArticle } from './utils';
import ofetch from '@/utils/ofetch';
import { load } from 'cheerio';
import { parseDate } from '@/utils/parse-date';
import asyncPool from 'tiny-async-pool';
import timezone from '@/utils/timezone';
// Origin of the AP News site; the handler appends `/<route>.xml` to it to build the sitemap URL.
const HOME_PAGE = 'https://apnews.com';

/**
 * Route definition for the AP News sitemap feed.
 *
 * The `:route` path parameter is the sitemap file name without its `.xml`
 * extension; `handler` resolves it against `HOME_PAGE` and turns the listed
 * URLs into feed items.
 */
export const route: Route = {
// Path pattern; `:route` is read by the handler via `ctx.req.param('route')`.
path: '/sitemap/:route',
categories: ['traditional-media'],
// Sample path, i.e. the sitemap file `ap-sitemap-latest.xml`.
example: '/apnews/sitemap/ap-sitemap-latest',
view: ViewType.Articles,
parameters: {
route: {
description: 'Route for sitemap, excluding the `.xml` extension',
},
},
// Capability flags for this route; all are disabled.
features: {
requireConfig: false,
requirePuppeteer: false,
antiCrawler: false,
supportBT: false,
supportPodcast: false,
supportScihub: false,
},
radar: [
{
source: ['apnews.com/'],
},
],
name: 'Sitemap',
maintainers: ['zoenglinghou', 'mjysci', 'TonyRL', 'dzx-dzx'],
handler,
};

/** Number of sitemap entries processed when the `limit` query parameter is absent or unusable. */
const DEFAULT_ITEM_LIMIT = 20;

/** Maps the three-letter codes found in `news:language` to the two-letter codes stored on entries. */
const SITEMAP_LANGUAGE_MAP = new Map([
    ['eng', 'en'],
    ['spa', 'es'],
]);

/** One `<url>` entry of the sitemap; optional fields are only set when the sitemap provided a usable value. */
interface SitemapEntry {
    link: string;
    title?: string;
    pubDate?: Date;
    language?: string;
    lastmod?: Date;
}

/** Returns true when `date` holds a real point in time (i.e. is not an "Invalid Date"). */
function isValidDate(date: Date): boolean {
    return !Number.isNaN(date.getTime());
}

/**
 * Returns true when `link` is a non-empty, parseable URL whose path has no `hub` segment.
 * Malformed links are rejected instead of letting `new URL` throw and abort the whole route.
 */
function isArticleLink(link: string): boolean {
    if (!link) {
        return false;
    }
    let pathname: string;
    try {
        pathname = new URL(link).pathname;
    } catch {
        return false;
    }
    return !pathname.split('/').includes('hub');
}

/** Timestamp used for ordering: publication date if present, else last-modified date, else -Infinity (sorts last). */
function entryTimestamp(entry: SitemapEntry): number {
    return (entry.pubDate ?? entry.lastmod)?.getTime() ?? Number.NEGATIVE_INFINITY;
}

/**
 * Newest-first comparator over sitemap entries.
 * Always returns -1, 0 or 1 (never NaN), so the ordering stays consistent even
 * when some entries carry no date at all.
 */
function compareByRecency(a: SitemapEntry, b: SitemapEntry): number {
    const timeA = entryTimestamp(a);
    const timeB = entryTimestamp(b);
    if (timeA === timeB) {
        return 0;
    }
    return timeB > timeA ? 1 : -1;
}

/**
 * Parses the `limit` query value.
 * Falls back to `DEFAULT_ITEM_LIMIT` when the value is missing, not a number
 * or negative, so a bad query string cannot silently empty or truncate the feed.
 */
function resolveLimit(rawLimit: string | undefined): number {
    if (!rawLimit) {
        return DEFAULT_ITEM_LIMIT;
    }
    const parsed = Number.parseInt(rawLimit, 10);
    return Number.isNaN(parsed) || parsed < 0 ? DEFAULT_ITEM_LIMIT : parsed;
}

/**
 * Builds a feed from an AP News sitemap.
 *
 * Downloads `${HOME_PAGE}/<route>.xml`, extracts every `<url>` entry, drops
 * entries whose link is empty, malformed or contains a `hub` path segment,
 * orders the rest newest-first, keeps at most `limit` entries (default 20)
 * and passes each one to `fetchArticle`.
 *
 * @param ctx Request context; the `route` path parameter and the optional
 *            `limit` query parameter are read from `ctx.req`.
 * @returns An object with the feed `title` and the fetched `item` list.
 */
async function handler(ctx) {
    const route = ctx.req.param('route');
    const url = `${HOME_PAGE}/${route}.xml`;
    const response = await ofetch(url);
    const $ = load(response);

    const list = $('urlset url')
        .toArray()
        .map((element): SitemapEntry => {
            const node = $(element);

            // The colon of the `news:` namespace prefix must be escaped inside a CSS selector.
            const title = node.find(String.raw`news\:title`).text();
            const pubDate = parseDate(node.find(String.raw`news\:publication_date`).text());
            // NOTE(review): `lastmod` is passed through `timezone(..., -4)`; presumably the
            // sitemap reports it in UTC-4 without an offset — confirm against a live sitemap.
            const lastmod = timezone(parseDate(node.find(`lastmod`).text()), -4);
            const language = SITEMAP_LANGUAGE_MAP.get(node.find(String.raw`news\:language`).text());

            // Only attach fields that carry a usable value.
            const entry: SitemapEntry = { link: node.find('loc').text() };
            if (title) {
                entry.title = title;
            }
            if (isValidDate(pubDate)) {
                entry.pubDate = pubDate;
            }
            if (language) {
                entry.language = language;
            }
            if (isValidDate(lastmod)) {
                entry.lastmod = lastmod;
            }
            return entry;
        })
        .filter((entry) => isArticleLink(entry.link))
        .sort(compareByRecency)
        .slice(0, resolveLimit(ctx.req.query('limit')));

    // Fetch article pages with at most 20 requests in flight.
    const items = await asyncPoolAll(20, list, (item) => fetchArticle(item));

    return {
        title: `AP News sitemap:${route}`,
        item: items,
    };
}
/**
 * Drains `asyncPool` into an array.
 *
 * Hands `poolLimit`, `array` and `iteratorFn` to `asyncPool` and collects
 * every value it yields, in the order the pool yields them.
 *
 * @param poolLimit  Limit forwarded to `asyncPool`.
 * @param array      Inputs to process.
 * @param iteratorFn Async worker invoked for each input.
 * @returns All values yielded by the pool.
 */
async function asyncPoolAll<IN, OUT>(poolLimit: number, array: readonly IN[], iteratorFn: (generator: IN) => Promise<OUT>) {
    const collected: OUT[] = [];
    const pool = asyncPool(poolLimit, array, iteratorFn);
    for await (const value of pool) {
        collected.push(value);
    }
    return collected;
}
3 changes: 2 additions & 1 deletion lib/routes/apnews/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,13 @@ export function fetchArticle(item) {
const rawLdjson = JSON.parse($('#link-ld-json').text());
let ldjson;
if (rawLdjson['@type'] === 'NewsArticle' || (Array.isArray(rawLdjson) && rawLdjson.some((e) => e['@type'] === 'NewsArticle'))) {
// Regular
// Regular(Articles, Videos)
ldjson = Array.isArray(rawLdjson) ? rawLdjson.find((e) => e['@type'] === 'NewsArticle') : rawLdjson;

$('div.Enhancement').remove();
const section = $("meta[property='article:section']").attr('content');
return {
title: ldjson.headline,
pubDate: parseDate(ldjson.datePublished),
updated: parseDate(ldjson.dateModified),
description: $('div.RichTextStoryBody').html() || $(':is(.VideoLead, .VideoPage-pageSubHeading)').html(),
Expand Down

0 comments on commit 5a5bce7

Please sign in to comment.