Skip to content

Commit

Permalink
feat(route): cfr (#15682)
Browse files Browse the repository at this point in the history
* feat(route): cfr

* fix: fix list selector

* fix: add item link

* Update lib/routes/cfr/utils.ts

Co-authored-by: Tony <TonyRL@users.noreply.github.com>

* fix

* fix: add books-reports selector

* feat: concurrent control

* add antiCrawler tag

---------
  • Loading branch information
KarasuShin authored May 24, 2024
1 parent 34a883c commit f71073d
Show file tree
Hide file tree
Showing 4 changed files with 378 additions and 0 deletions.
58 changes: 58 additions & 0 deletions lib/routes/cfr/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import type { Data, Route } from '@/types';
import type { Context } from 'hono';
import ofetch from '@/utils/ofetch';
import { load } from 'cheerio';
import { asyncPoolAll, getDataItem } from './utils';

/**
 * Route definition for the Council on Foreign Relations feed.
 *
 * `category` is a required path segment and `subCategory` an optional one;
 * `handler` joins both onto the site origin to build the listing page URL.
 */
export const route: Route = {
path: '/:category/:subCategory?',
categories: ['traditional-media'],
parameters: {
category: 'category, find it in the URL',
subCategory: 'sub-category, find it in the URL',
},
example: '/cfr/asia',
name: 'News',
maintainers: ['KarasuShin'],
handler,
// Maps one- and two-segment www.cfr.org URLs onto this route's path.
radar: [
{
source: ['www.cfr.org/:category', 'www.cfr.org/:category/:subCategory'],
target: '/:category/:subCategory?',
},
],
features: {
// Flags the upstream site as anti-crawler protected.
antiCrawler: true,
},
};

/**
 * Builds the feed for a CFR listing page.
 *
 * Fetches `https://www.cfr.org/<category>[/<subCategory>]`, collects the
 * article links found by a category-specific selector and resolves each one
 * into a full item (at most five detail pages are fetched concurrently).
 *
 * @param ctx - Hono context carrying the `category` / `subCategory` route params.
 * @returns Feed data with the page title, the listing URL and the resolved items.
 */
async function handler(ctx: Context): Promise<Data> {
    const { category, subCategory } = ctx.req.param();

    const origin = 'https://www.cfr.org';
    const link = subCategory ? `${origin}/${category}/${subCategory}` : `${origin}/${category}`;

    const res = await ofetch(link);
    const $ = load(res);

    // A Map (rather than a plain object) so that a user-supplied category such as
    // "constructor" cannot resolve to an inherited Object.prototype member.
    const selectorMap = new Map<string, string>([
        ['podcasts', '.episode-content__title a'],
        ['blog', '.card-series__content-link'],
        ['books-reports', '.card-article__link'],
    ]);
    const listSelector = selectorMap.get(category) ?? '.card-article-large__link';

    // Anchors without an href cannot be resolved to a detail page, so they are
    // skipped instead of being forwarded as `undefined`. Duplicates are dropped
    // so the same page is not emitted twice in the feed.
    const hrefs = [
        ...new Set(
            $(listSelector)
                .toArray()
                .map((item) => $(item).attr('href'))
                .filter((href): href is string => Boolean(href))
        ),
    ];

    const items = await asyncPoolAll(5, hrefs, (href) => getDataItem(href));

    return {
        title: $('head title').text().replace(' | Council on Foreign Relations', ''),
        link,
        item: items,
    };
}
6 changes: 6 additions & 0 deletions lib/routes/cfr/namespace.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
import { Namespace } from '@/types';

/** Namespace metadata (display name and site host) for the `cfr` routes. */
export const namespace: Namespace = {
name: 'Council on Foreign Relations',
url: 'www.cfr.org',
};
30 changes: 30 additions & 0 deletions lib/routes/cfr/types.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
/**
 * Shape applied to the parsed content of a page's
 * `<script type="application/ld+json">` element.
 *
 * Only `@graph[0].name` and `@graph[0].dateModified` are read by the parsers
 * in utils.ts; the remaining fields describe the payload but are not validated
 * at runtime.
 */
export interface LinkData {
'@context': string;
// Entries of the JSON-LD graph; the parsers use the first entry only.
'@graph': {
'@type': string;
headline: string;
name: string;
about: string;
description: string;
image: {
'@type': string;
representativeOfPage: string;
url: string;
};
datePublished: string;
dateModified: string;
author: {
'@type': string;
name: string;
url: string;
};
}[];
}

/**
 * Shape applied to the `data-setup` value read from `<video>` elements.
 *
 * utils.ts searches `sources` for an entry whose `type` is `video/youtube`
 * and extracts the video id from its `src`.
 */
export interface VideoSetup {
techOrder: string[];
sources: {
type: string;
src: string;
}[];
}
284 changes: 284 additions & 0 deletions lib/routes/cfr/utils.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,284 @@
import { type Cheerio, type CheerioAPI, type Element, load } from 'cheerio';
import ofetch from '@/utils/ofetch';
import type { DataItem } from '@/types';
import { parseDate } from '@/utils/parse-date';
import cache from '@/utils/cache';
import type { LinkData, VideoSetup } from './types';
import asyncPool from 'tiny-async-pool';

/**
 * Fetches a CFR detail page and converts it into a feed item.
 *
 * The first path segment of the URL (e.g. `article`, `blog`, `podcasts`)
 * selects the page-type specific parser; unknown segments fall back to
 * `parseDefault`. Results are cached under the resolved absolute URL.
 *
 * @param href - Link taken from a listing page; may be site-relative
 *   (`/article/...`) or already absolute.
 * @returns The parsed item with `link` set to the absolute page URL.
 */
export function getDataItem(href: string) {
    const origin = 'https://www.cfr.org';
    // Resolve against the origin instead of concatenating strings: an href that
    // is already absolute would otherwise become `https://www.cfr.orghttps://…`.
    const url = new URL(href, origin);
    const link = url.href;

    return cache.tryGet(link, async () => {
        // Pathname always starts with '/', so index 1 is the first real segment.
        const prefix = url.pathname.split('/')[1];
        const res = await ofetch(link);
        const $ = load(res);

        let dataItem: DataItem;

        switch (prefix) {
            case 'article':
                dataItem = parseArticle($);
                break;
            case 'blog':
                dataItem = parseBlog($);
                break;
            case 'book':
                dataItem = parseBook($);
                break;
            case 'conference-calls':
                dataItem = parseConferenceCalls($);
                break;
            case 'event':
                dataItem = parseEvent($);
                break;
            case 'backgrounder':
                dataItem = parseBackgrounder($);
                break;
            case 'podcasts':
                dataItem = parsePodcasts($);
                break;
            case 'task-force-report':
                dataItem = parseTaskForceReport($);
                break;
            case 'timeline':
                dataItem = parseTimeline($);
                break;
            case 'video':
                dataItem = parseVideo($);
                break;
            default:
                dataItem = parseDefault($);
        }

        return {
            ...dataItem,
            link,
        };
    }) as Promise<DataItem>;
}

/**
 * Parses an `/article/...` page.
 *
 * The description is the cleaned `.body-content` markup, preceded by the
 * header image wrapped in a `<figure>` when the page has one.
 */
function parseArticle($: CheerioAPI): DataItem {
    const meta = parseLinkData($);
    const body = parseDescription($('.body-content'), $);

    const $headerImage = $('.article-header__image');
    const description = $headerImage.length ? `<figure>${$headerImage.html()}</figure><br>${body}` : body;

    return {
        title: meta?.title ?? $('.article-header__title').text(),
        pubDate: meta?.pubDate,
        description,
    };
}

/**
 * Parses a `/blog/...` page.
 *
 * The description is the cleaned `.body-content` markup, preceded by the blog
 * header figure when the page has one.
 */
function parseBlog($: CheerioAPI): DataItem {
    const meta = parseLinkData($);
    const body = parseDescription($('.body-content'), $);

    const $figure = $('.article-header-blog__figure');
    const description = $figure.length ? `<figure>${$figure.html()}</figure><br>${body}` : body;

    return {
        title: meta?.title ?? $('.article-header-blog__title').text(),
        pubDate: meta?.pubDate,
        description,
    };
}

/**
 * Parses a `/book/...` page.
 *
 * The description is the cleaned `.body-content` markup, preceded by the
 * header's top section when the page has one.
 */
function parseBook($: CheerioAPI): DataItem {
    const linkData = parseLinkData($);
    let description = parseDescription($('.body-content'), $);

    // `.html()` yields null when nothing matches; interpolating that directly
    // would put the literal text "null" at the top of the description.
    const sectionTop = $('.article-header__section-top').html();
    if (sectionTop) {
        description = `${sectionTop}<br>${description}`;
    }

    return {
        title: linkData?.title ?? $('.article-header__title').text(),
        pubDate: linkData?.pubDate,
        description,
    };
}

/**
 * Parses a `/conference-calls/...` page, using the last `.podcast-body`
 * element as the description source.
 */
function parseConferenceCalls($: CheerioAPI): DataItem {
    const meta = parseLinkData($);
    const $body = $('.podcast-body').last();

    return {
        title: meta?.title ?? $('head title').text(),
        pubDate: meta?.pubDate,
        description: parseDescription($body, $),
    };
}

/**
 * Parses an `/event/...` page.
 *
 * The description is the cleaned `.body-content` markup, preceded by an
 * embedded player when `.msp-event-video` yields a video iframe.
 */
function parseEvent($: CheerioAPI): DataItem {
    const meta = parseLinkData($);
    const body = parseDescription($('.body-content'), $);

    const videoIframe = getVideoIframe($('.msp-event-video'));
    const description = videoIframe ? `${videoIframe}<br>${body}` : body;

    return {
        title: meta?.title ?? $('.msp-event-header-past__title').text(),
        pubDate: meta?.pubDate,
        description,
    };
}

/**
 * Parses a `/backgrounder/...` page.
 *
 * The description is assembled, in order, from the header figure (if any),
 * the summary block (if non-empty) and the cleaned article body, separated
 * by `<br>`.
 */
function parseBackgrounder($: CheerioAPI): DataItem {
    const meta = parseLinkData($);

    // Queried in the same order as they are used so that the body clean-up
    // runs before the summary and figure are read.
    const body = parseDescription($('.main-wrapper__article-body .body-content'), $);
    const summary = $('.main-wrapper__article-body .summary').html();
    const $figure = $('.article-header-backgrounder__figure');

    const parts: string[] = [];
    if ($figure.length) {
        parts.push(`<figure>${$figure.html()}</figure>`);
    }
    if (summary) {
        parts.push(summary);
    }
    parts.push(body);

    return {
        title: meta?.title ?? $('.article-header-backgrounder__title').text(),
        pubDate: meta?.pubDate,
        description: parts.join('<br>'),
    };
}

/**
 * Parses a `/podcasts/...` page.
 *
 * The description is the first `.body-content` block, preceded by an
 * `<audio>` player when `#player-default` carries a `src`. The same source is
 * exposed as the item's enclosure.
 */
function parsePodcasts($: CheerioAPI): DataItem {
    const meta = parseLinkData($);
    const notes = $('.body-content').first().html() ?? '';
    const audioSrc = $('#player-default').attr('src');

    const description = audioSrc ? `<audio controls src="${audioSrc}"></audio><br>${notes}` : notes;

    return {
        title: meta?.title ?? $('head title').text(),
        pubDate: meta?.pubDate,
        description,
        enclosure_url: audioSrc,
        enclosure_type: 'audio/mpeg',
    };
}

/**
 * Parses a `/task-force-report/...` page.
 *
 * The description concatenates the `.content_area` markup of every
 * `.main-content` section, each followed by `<br>`. When no JSON-LD title is
 * available the hero title is used with its `.subtitle` child stripped.
 */
function parseTaskForceReport($: CheerioAPI): DataItem {
    const linkData = parseLinkData($);

    const description = $('.main-content')
        .toArray()
        .map((section) => `${$(section).find('.content_area').html() ?? ''}<br>`)
        .join('');

    // `.remove(selector)` only filters the matched set itself; it does not touch
    // descendants. To drop the nested subtitle, work on a clone and remove the
    // child explicitly so the page DOM stays untouched.
    const heroTitle = (): string => {
        const $title = $('.hero__title').clone();
        $title.find('.subtitle').remove();
        return $title.text().trim();
    };

    return {
        title: linkData?.title ?? heroTitle(),
        pubDate: linkData?.pubDate,
        description,
    };
}

/**
 * Parses a `/timeline/...` page.
 *
 * Starts from the timeline intro text and appends every slide's content.
 * Within each slide the date header becomes an `<h1>` and the date range an
 * `<h2>`; shadow overlays are dropped and image wrappers are collapsed to
 * their `<img>`.
 */
function parseTimeline($: CheerioAPI): DataItem {
    const meta = parseLinkData($);

    const $slides = $('.timeline-slides');
    $slides.find('.timeline-slide__shadow').remove();
    $slides.find('.field--image').each((_, wrapper) => {
        const $wrapper = $(wrapper);
        $wrapper.replaceWith($wrapper.find('img'));
    });

    let description = $slides.find('.timeline-intro__description').html() ?? '';

    $slides.find('.timeline-slide__content').each((_, slide) => {
        const $slide = $(slide);

        // Header is rewritten first, then the dates are looked up afresh.
        const $header = $slide.find('.timeline-slide__dates-header');
        $header.replaceWith(`<h1>${$header.text()}</h1>`);

        const $dates = $slide.find('.timeline-slide__dates');
        $dates.replaceWith(`<h2>${$dates.text()}</h2>`);

        description += `<br>${$slide.html()}`;
    });

    return {
        title: meta?.title ?? $('.timeline-header__title').text(),
        pubDate: meta?.pubDate,
        description,
    };
}

/**
 * Parses a `/video/...` page.
 *
 * The description is the cleaned `.body-content` markup, preceded by an
 * embedded player when the header image area yields a video iframe.
 */
function parseVideo($: CheerioAPI): DataItem {
    const meta = parseLinkData($);
    const body = parseDescription($('.body-content'), $);

    const videoIframe = getVideoIframe($('.article-header__image'));
    const description = videoIframe ? `${videoIframe}<br>${body}` : body;

    return {
        title: meta?.title ?? $('.article-header__title').text(),
        pubDate: meta?.pubDate,
        description,
    };
}

/**
 * Fallback parser for page types without a dedicated handler.
 *
 * Pages that contain a `.body-content` block are treated like articles;
 * otherwise only the title and publication date are returned.
 */
function parseDefault($: CheerioAPI): DataItem {
    if ($('.body-content').length) {
        return parseArticle($);
    }
    const linkData = parseLinkData($);
    return {
        title: linkData?.title ?? $('head title').text(),
        pubDate: linkData?.pubDate,
    };
}

/**
 * Extracts title and date from the page's JSON-LD metadata.
 *
 * Each `<script type="application/ld+json">` element is parsed on its own and
 * the first one exposing a non-empty `@graph` wins. Parsing them separately
 * matters because concatenating several scripts never yields valid JSON.
 *
 * @returns `{ title, pubDate }` taken from the first graph entry, or
 *   `undefined` when no usable JSON-LD is present (callers then fall back to
 *   DOM selectors).
 */
function parseLinkData($: CheerioAPI) {
    for (const script of $('script[type="application/ld+json"]').toArray()) {
        try {
            const parsed = JSON.parse($(script).text()) as Partial<LinkData> | null;
            const data = parsed?.['@graph']?.[0];
            if (!data) {
                continue;
            }

            // Only call parseDate with an actual value; fall back to the
            // publication date when no modification date is given.
            const rawDate = data.dateModified ?? data.datePublished;

            return {
                title: data.name,
                pubDate: rawDate ? parseDate(rawDate) : undefined,
            };
        } catch {
            // Malformed JSON-LD in this script: best effort, try the next one.
        }
    }
    return undefined;
}

/**
 * Builds a YouTube embed iframe for the `<video>` found inside `$ele`.
 *
 * The player configuration is read from the video's `data-setup` attribute;
 * the first source of type `video/youtube` provides the video id.
 *
 * @param $ele - Element (set) to search for a `<video>` descendant.
 * @returns The iframe markup, or `undefined` when there is no video, no
 *   usable setup, no YouTube source or no recognisable video id.
 */
function getVideoIframe($ele: Cheerio<Element>) {
    // `.data()` only yields an object when the attribute held valid JSON;
    // otherwise it is a plain string (or undefined), so validate before use.
    const setup = $ele.find('video').data('setup') as Partial<VideoSetup> | string | undefined;
    if (!setup || typeof setup !== 'object' || !Array.isArray(setup.sources)) {
        return undefined;
    }

    const youtubeSource = setup.sources.find((source) => source?.type === 'video/youtube');

    // Accept `v` anywhere in the query string, and stop at `&` or a fragment.
    const videoId = youtubeSource?.src?.match(/[?&]v=([^&#]+)/)?.[1];
    if (!videoId) {
        return undefined;
    }

    return `<iframe src="https://www.youtube-nocookie.com/embed/${videoId}" width="640" height="360" frameborder="0" allowfullscreen></iframe>`;
}

/**
 * Cleans a content container in place and returns its inner HTML.
 *
 * Removes layout-only and promotional nodes, collapses image wrappers to their
 * `<img>`, and swaps video embeds for a YouTube iframe where one can be built.
 *
 * @param $description - Container whose descendants are cleaned (mutated).
 * @param $ - Cheerio instance of the page, used to wrap visited nodes.
 * @returns The container's inner HTML, or an empty string if it is empty/missing.
 */
function parseDescription($description: Cheerio<Element>, $: CheerioAPI) {
    const unwantedSelectors = ['.desktop-only', '.mobile-only', '.newsletter-tout', '.carousel-gallery', 'svg'];
    for (const selector of unwantedSelectors) {
        $description.find(selector).remove();
    }

    $description.find('.field--image').each((_, wrapper) => {
        const $wrapper = $(wrapper);
        $wrapper.replaceWith($wrapper.find('img'));
    });

    $description.find('.video-embed').each((_, embed) => {
        const $embed = $(embed);
        const iframe = getVideoIframe($embed);
        if (iframe) {
            $embed.replaceWith(iframe);
        }
    });

    return $description.html() ?? '';
}

/**
 * Runs `iteratorFn` over `array` with at most `poolLimit` calls in flight and
 * collects all results.
 *
 * The underlying pool yields values as they complete, which is not
 * necessarily input order. Each task is therefore tagged with its index so
 * that the returned array lines up with `array` position by position.
 *
 * @param poolLimit - Maximum number of concurrently running tasks.
 * @param array - Inputs to process.
 * @param iteratorFn - Async mapper invoked once per input.
 * @returns Results in the same order as `array`. Rejects with the first task
 *   error surfaced by the pool.
 */
export async function asyncPoolAll<IN, OUT>(poolLimit: number, array: readonly IN[], iteratorFn: (generator: IN) => Promise<OUT>) {
    const indexed = array.map((value, index) => ({ value, index }));
    const results = new Array<OUT>(indexed.length);

    const tagged = asyncPool(poolLimit, indexed, async ({ value, index }) => ({
        index,
        result: await iteratorFn(value),
    }));

    for await (const { index, result } of tagged) {
        results[index] = result;
    }
    return results;
}

0 comments on commit f71073d

Please sign in to comment.