-
Notifications
You must be signed in to change notification settings - Fork 7.7k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* feat(route): cfr * fix: fix list selector * fix: add item link * Update lib/routes/cfr/utils.ts Co-authored-by: Tony <TonyRL@users.noreply.github.com> * fix * fix: add books-reports selector * feat: concurrent control * add antiCrawler tag ---------
- Loading branch information
1 parent
34a883c
commit f71073d
Showing
4 changed files
with
378 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
import type { Data, Route } from '@/types'; | ||
import type { Context } from 'hono'; | ||
import ofetch from '@/utils/ofetch'; | ||
import { load } from 'cheerio'; | ||
import { asyncPoolAll, getDataItem } from './utils'; | ||
|
||
// `path` and the radar `target` use the same pattern, so it is declared once.
const categoryPath = '/:category/:subCategory?';

/**
 * Route definition for Council on Foreign Relations listing pages.
 *
 * `category` is required and `subCategory` is optional; both are taken from
 * the URL of the page on www.cfr.org.
 */
export const route: Route = {
    path: categoryPath,
    categories: ['traditional-media'],
    parameters: {
        category: 'category, find it in the URL',
        subCategory: 'sub-category, find it in the URL',
    },
    example: '/cfr/asia',
    name: 'News',
    maintainers: ['KarasuShin'],
    handler,
    radar: [
        {
            source: ['www.cfr.org/:category', 'www.cfr.org/:category/:subCategory'],
            target: categoryPath,
        },
    ],
    features: {
        antiCrawler: true,
    },
};
|
||
/**
 * Build the feed for a CFR listing page.
 *
 * Fetches `https://www.cfr.org/:category[/:subCategory]`, collects the item
 * links found on the listing and resolves each of them through `getDataItem`,
 * running at most five requests concurrently.
 *
 * @param ctx Request context carrying the `category` and optional `subCategory` route parameters.
 * @returns Feed data; the title is the page title without the site-name suffix.
 */
async function handler(ctx: Context): Promise<Data> {
    const { category, subCategory } = ctx.req.param();

    const origin = 'https://www.cfr.org';
    const link = subCategory ? `${origin}/${category}/${subCategory}` : `${origin}/${category}`;
    const res = await ofetch(link);

    const $ = load(res);

    // A Map (rather than an object literal) so that a user-supplied category such as
    // "constructor" cannot resolve to an inherited Object.prototype member.
    const selectorMap = new Map<string, string>([
        ['podcasts', '.episode-content__title a'],
        ['blog', '.card-series__content-link'],
        ['books-reports', '.card-article__link'],
    ]);

    const listSelector = selectorMap.get(category) ?? '.card-article-large__link';

    // Skip anchors without an href (they would otherwise become "<origin>undefined")
    // and drop repeated links so the same page is not emitted twice.
    const hrefs = [
        ...new Set(
            $(listSelector)
                .toArray()
                .map((item) => $(item).attr('href'))
                .filter((href): href is string => Boolean(href))
        ),
    ];

    const items = await asyncPoolAll(5, hrefs, (href) => getDataItem(href));

    return {
        title: $('head title').text().replace(' | Council on Foreign Relations', ''),
        link,
        item: items,
    };
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
import { Namespace } from '@/types'; | ||
|
||
/** Namespace metadata (display name and site host) for the Council on Foreign Relations routes. */
export const namespace: Namespace = {
    name: 'Council on Foreign Relations',
    url: 'www.cfr.org',
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
/** Image object attached to a `@graph` node. */
interface LinkDataImage {
    '@type': string;
    representativeOfPage: string;
    url: string;
}

/** Author object attached to a `@graph` node. */
interface LinkDataAuthor {
    '@type': string;
    name: string;
    url: string;
}

/** One entry of the `@graph` array. */
interface LinkDataNode {
    '@type': string;
    headline: string;
    name: string;
    about: string;
    description: string;
    image: LinkDataImage;
    datePublished: string;
    dateModified: string;
    author: LinkDataAuthor;
}

/**
 * Shape of the JSON document read from a page's
 * `<script type="application/ld+json">` element.
 */
export interface LinkData {
    '@context': string;
    '@graph': LinkDataNode[];
}
|
||
/** A single playable source listed in a video's setup data. */
interface VideoSource {
    type: string;
    src: string;
}

/** Value of the `data-setup` attribute read from a `<video>` element. */
export interface VideoSetup {
    techOrder: string[];
    sources: VideoSource[];
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,284 @@ | ||
import { type Cheerio, type CheerioAPI, type Element, load } from 'cheerio'; | ||
import ofetch from '@/utils/ofetch'; | ||
import type { DataItem } from '@/types'; | ||
import { parseDate } from '@/utils/parse-date'; | ||
import cache from '@/utils/cache'; | ||
import type { LinkData, VideoSetup } from './types'; | ||
import asyncPool from 'tiny-async-pool'; | ||
|
||
// Page parsers keyed by the first path segment of the item URL.
// A Map so that segments like "constructor" cannot hit inherited object members.
const pageParsers: ReadonlyMap<string, ($: CheerioAPI) => DataItem> = new Map([
    ['article', parseArticle],
    ['blog', parseBlog],
    ['book', parseBook],
    ['conference-calls', parseConferenceCalls],
    ['event', parseEvent],
    ['backgrounder', parseBackgrounder],
    ['podcasts', parsePodcasts],
    ['task-force-report', parseTaskForceReport],
    ['timeline', parseTimeline],
    ['video', parseVideo],
]);

/**
 * Fetch one CFR page and turn it into a feed item.
 *
 * The result is cached under the page URL. The parser is chosen from the first
 * path segment of the URL (e.g. `/blog/...` -> `parseBlog`); unknown segments
 * fall back to `parseDefault`.
 *
 * @param href Link taken from a listing page; either a site-relative path or an absolute URL.
 * @returns The parsed item with `link` set to the resolved page URL.
 */
export function getDataItem(href: string): Promise<DataItem> {
    const origin = 'https://www.cfr.org';
    // Resolve against the origin instead of concatenating strings, so an absolute
    // href does not become "https://www.cfr.orghttps://...".
    const url = new URL(href, origin);
    const link = url.href;

    return cache.tryGet(link, async () => {
        const prefix = url.pathname.split('/')[1];
        const res = await ofetch(link);
        const $ = load(res);

        const parse = pageParsers.get(prefix) ?? parseDefault;

        return {
            ...parse($),
            link,
        };
    }) as Promise<DataItem>;
}
|
||
/**
 * Parse an `/article/...` page.
 *
 * The description is the cleaned `.body-content`, preceded by the header image
 * wrapped in a `<figure>` when `.article-header__image` exists.
 */
function parseArticle($: CheerioAPI): DataItem {
    const meta = parseLinkData($);
    const body = parseDescription($('.body-content'), $);
    const $headerImage = $('.article-header__image');
    const description = $headerImage.length ? `<figure>${$headerImage.html()}</figure><br>${body}` : body;

    return {
        title: meta?.title ?? $('.article-header__title').text(),
        pubDate: meta?.pubDate,
        description,
    };
}
|
||
/**
 * Parse a `/blog/...` page.
 *
 * The description is the cleaned `.body-content`, preceded by the contents of
 * `.article-header-blog__figure` wrapped in a `<figure>` when that element exists.
 */
function parseBlog($: CheerioAPI): DataItem {
    const meta = parseLinkData($);
    const body = parseDescription($('.body-content'), $);
    const $figure = $('.article-header-blog__figure');
    const description = $figure.length ? `<figure>${$figure.html()}</figure><br>${body}` : body;

    return {
        title: meta?.title ?? $('.article-header-blog__title').text(),
        pubDate: meta?.pubDate,
        description,
    };
}
|
||
/**
 * Parse a `/book/...` page.
 *
 * The description is the cleaned `.body-content`, preceded by the markup of
 * `.article-header__section-top` when that section is present.
 */
function parseBook($: CheerioAPI): DataItem {
    const linkData = parseLinkData($);
    let description = parseDescription($('.body-content'), $);

    // `.html()` returns null when nothing matched; interpolating it blindly
    // would put the literal text "null" at the top of the description.
    const sectionTop = $('.article-header__section-top').html();
    if (sectionTop) {
        description = `${sectionTop}<br>${description}`;
    }

    return {
        title: linkData?.title ?? $('.article-header__title').text(),
        pubDate: linkData?.pubDate,
        description,
    };
}
|
||
/**
 * Parse a `/conference-calls/...` page.
 *
 * The description comes from the last `.podcast-body` element on the page.
 */
function parseConferenceCalls($: CheerioAPI): DataItem {
    const meta = parseLinkData($);
    const $lastBody = $('.podcast-body').last();

    return {
        title: meta?.title ?? $('head title').text(),
        pubDate: meta?.pubDate,
        description: parseDescription($lastBody, $),
    };
}
|
||
/**
 * Parse an `/event/...` page.
 *
 * The description is the cleaned `.body-content`, preceded by a YouTube iframe
 * when one can be derived from `.msp-event-video`.
 */
function parseEvent($: CheerioAPI): DataItem {
    const meta = parseLinkData($);
    const body = parseDescription($('.body-content'), $);
    const iframe = getVideoIframe($('.msp-event-video'));
    const description = iframe ? `${iframe}<br>${body}` : body;

    return {
        title: meta?.title ?? $('.msp-event-header-past__title').text(),
        pubDate: meta?.pubDate,
        description,
    };
}
|
||
/**
 * Parse a `/backgrounder/...` page.
 *
 * The description is assembled, top to bottom, from the header figure (if any),
 * the summary (if any) and the cleaned article body.
 */
function parseBackgrounder($: CheerioAPI): DataItem {
    const meta = parseLinkData($);
    const body = parseDescription($('.main-wrapper__article-body .body-content'), $);

    const summary = $('.main-wrapper__article-body .summary').html();
    const withSummary = summary ? `${summary}<br>${body}` : body;

    const $figure = $('.article-header-backgrounder__figure');
    const description = $figure.length ? `<figure>${$figure.html()}</figure><br>${withSummary}` : withSummary;

    return {
        title: meta?.title ?? $('.article-header-backgrounder__title').text(),
        pubDate: meta?.pubDate,
        description,
    };
}
|
||
/**
 * Parse a `/podcasts/...` page.
 *
 * The description is the first `.body-content` element. When `#player-default`
 * has a `src`, an `<audio>` player is prepended and the same URL is exposed as
 * the item's enclosure.
 */
function parsePodcasts($: CheerioAPI): DataItem {
    const linkData = parseLinkData($);
    let description = $('.body-content').first().html() ?? '';
    const audioSrc = $('#player-default').attr('src');
    if (audioSrc) {
        description = `<audio controls src="${audioSrc}"></audio><br>${description}`;
    }
    return {
        title: linkData?.title ?? $('head title').text(),
        pubDate: linkData?.pubDate,
        description,
        // Only advertise an enclosure when there is an audio URL; an
        // `enclosure_type` without an `enclosure_url` is meaningless.
        ...(audioSrc ? { enclosure_url: audioSrc, enclosure_type: 'audio/mpeg' } : {}),
    };
}
|
||
/**
 * Parse a `/task-force-report/...` page.
 *
 * The description concatenates the `.content_area` markup of every
 * `.main-content` section, each followed by `<br>`.
 */
function parseTaskForceReport($: CheerioAPI): DataItem {
    const linkData = parseLinkData($);

    let description = '';

    $('.main-content').each((_, ele) => {
        const content = $(ele).find('.content_area').html() ?? '';
        description += `${content}<br>`;
    });

    // Fallback title: the hero title without its nested subtitle.
    // `.remove('.subtitle')` on the title itself only filters the matched set
    // and never touches descendants, so remove the subtitle from a clone.
    // NOTE(review): assumes `.subtitle` is nested inside `.hero__title` - confirm against a live page.
    const heroTitle = () => {
        const $title = $('.hero__title').clone();
        $title.find('.subtitle').remove();
        return $title.text();
    };

    return {
        title: linkData?.title ?? heroTitle(),
        pubDate: linkData?.pubDate,
        description,
    };
}
|
||
/**
 * Parse a `/timeline/...` page.
 *
 * Works on `.timeline-slides`: shadow elements are removed, `.field--image`
 * wrappers are replaced by their `<img>`, and the description is the intro
 * followed by every slide's content, each slide prefixed with `<br>`. Within a
 * slide the dates header becomes an `<h1>` and the dates become an `<h2>`.
 */
function parseTimeline($: CheerioAPI): DataItem {
    const meta = parseLinkData($);

    const $slides = $('.timeline-slides');
    $slides.find('.timeline-slide__shadow').remove();
    $slides.find('.field--image').each((_, wrapper) => {
        const $wrapper = $(wrapper);
        $wrapper.replaceWith($wrapper.find('img'));
    });

    const intro = $slides.find('.timeline-intro__description').html() ?? '';

    const slideHtml = $slides
        .find('.timeline-slide__content')
        .toArray()
        .map((slide) => {
            const $slide = $(slide);

            const $datesHeader = $slide.find('.timeline-slide__dates-header');
            $datesHeader.replaceWith('<h1>' + $datesHeader.text() + '</h1>');

            const $dates = $slide.find('.timeline-slide__dates');
            $dates.replaceWith('<h2>' + $dates.text() + '</h2>');

            return `<br>${$slide.html()}`;
        });

    return {
        title: meta?.title ?? $('.timeline-header__title').text(),
        pubDate: meta?.pubDate,
        description: intro + slideHtml.join(''),
    };
}
|
||
/**
 * Parse a `/video/...` page.
 *
 * The description is the cleaned `.body-content`, preceded by a YouTube iframe
 * when one can be derived from the video inside `.article-header__image`.
 */
function parseVideo($: CheerioAPI): DataItem {
    const meta = parseLinkData($);
    const body = parseDescription($('.body-content'), $);
    const iframe = getVideoIframe($('.article-header__image'));
    const description = iframe ? `${iframe}<br>${body}` : body;

    return {
        title: meta?.title ?? $('.article-header__title').text(),
        pubDate: meta?.pubDate,
        description,
    };
}
|
||
/**
 * Fallback parser for pages whose URL prefix has no dedicated parser.
 *
 * Pages that contain a `.body-content` element are handled like articles;
 * otherwise only the title and publication date are returned.
 */
function parseDefault($: CheerioAPI): DataItem {
    if ($('.body-content').length) {
        return parseArticle($);
    }
    const linkData = parseLinkData($);
    return {
        title: linkData?.title ?? $('head title').text(),
        pubDate: linkData?.pubDate,
    };
}
|
||
/**
 * Read title and date from the page's JSON-LD metadata.
 *
 * Each `<script type="application/ld+json">` element is parsed on its own
 * (reading them all with a single `.text()` would concatenate several JSON
 * documents into one unparsable string). The first script that yields a
 * `@graph[0]` node wins.
 *
 * @returns The node's `name` as title and its `dateModified` (falling back to
 *          `datePublished`) as `pubDate`, or `undefined` when no usable
 *          metadata is found. `pubDate` is `undefined` when the node has no date.
 */
function parseLinkData($: CheerioAPI): { title: string | undefined; pubDate: Date | undefined } | undefined {
    for (const script of $('script[type="application/ld+json"]').toArray()) {
        try {
            const parsed = JSON.parse($(script).text()) as Partial<LinkData> | null;
            const data = parsed?.['@graph']?.[0];
            if (!data) {
                continue;
            }

            const date = data.dateModified ?? data.datePublished;
            return {
                title: data.name,
                // Do not hand a missing date to parseDate.
                pubDate: date ? parseDate(date) : undefined,
            };
        } catch {
            // Malformed JSON in this script: best effort, try the next one.
        }
    }
    return undefined;
}
|
||
/**
 * Build a YouTube embed iframe for the `<video>` found inside `$ele`.
 *
 * Reads the video's `data-setup` value, looks for a source of type
 * `video/youtube` and extracts the `v` query parameter from its URL.
 *
 * @param $ele Element that contains the `<video>` tag.
 * @returns The iframe markup, or `undefined` when there is no usable YouTube source.
 */
function getVideoIframe($ele: Cheerio<Element>): string | undefined {
    const setup = $ele.find('video').data('setup') as Partial<VideoSetup> | string | undefined;

    // `.data()` hands back the raw string when the attribute is not valid JSON,
    // so make sure there really is a `sources` array before touching it.
    if (!setup || typeof setup !== 'object' || !Array.isArray(setup.sources)) {
        return undefined;
    }

    const youtubeSource = setup.sources.find((source) => source?.type === 'video/youtube');
    // Accept `v` anywhere in the query string, not only as the first parameter.
    const videoId = youtubeSource?.src?.match(/[?&]v=([^&#]+)/)?.[1];
    if (!videoId) {
        return undefined;
    }

    return `<iframe src="https://www.youtube-nocookie.com/embed/${videoId}" width="640" height="360" frameborder="0" allowfullscreen></iframe>`;
}
|
||
/**
 * Clean an article body in place and return its inner HTML.
 *
 * Removes layout-only blocks and SVGs, replaces `.field--image` wrappers with
 * their `<img>`, and swaps `.video-embed` blocks for a YouTube iframe when one
 * can be derived.
 *
 * @param $description Container element to clean (it is mutated).
 * @param $ Cheerio instance of the page the container belongs to.
 * @returns Inner HTML of the container, or `''` when it has none.
 */
function parseDescription($description: Cheerio<Element>, $: CheerioAPI) {
    const unwanted = ['.desktop-only', '.mobile-only', '.newsletter-tout', '.carousel-gallery', 'svg'];
    for (const selector of unwanted) {
        $description.find(selector).remove();
    }

    for (const wrapper of $description.find('.field--image').toArray()) {
        const $wrapper = $(wrapper);
        $wrapper.replaceWith($wrapper.find('img'));
    }

    for (const embed of $description.find('.video-embed').toArray()) {
        const $embed = $(embed);
        const iframe = getVideoIframe($embed);
        if (iframe) {
            $embed.replaceWith(iframe);
        }
    }

    return $description.html() ?? '';
}
|
||
/**
 * Run `iteratorFn` over `array` with at most `poolLimit` calls in flight and
 * collect every result.
 *
 * `asyncPool` yields results as they complete, so each result is tagged with
 * the index of its input and stored at that position: the returned array has
 * the same order as `array`, regardless of which call finishes first.
 *
 * @param poolLimit Maximum number of concurrent `iteratorFn` calls.
 * @param array Inputs to process.
 * @param iteratorFn Async worker applied to each input.
 * @returns Results in input order. Rejects with the first error surfaced by the pool.
 */
export async function asyncPoolAll<IN, OUT>(poolLimit: number, array: readonly IN[], iteratorFn: (generator: IN) => Promise<OUT>) {
    const results = new Array<OUT>(array.length);
    const indexed = array.map((value, index) => [index, value] as const);

    for await (const [index, result] of asyncPool(poolLimit, indexed, ([index, value]) => iteratorFn(value).then((result) => [index, result] as const))) {
        results[index] = result;
    }
    return results;
}