-
Notifications
You must be signed in to change notification settings - Fork 7.7k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* fix(route): scmp full text * fix: null node check
- Loading branch information
Showing
2 changed files
with
121 additions
and
85 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,114 +1,90 @@ | ||
const parser = require('@/utils/rss-parser'); | ||
const cheerio = require('cheerio'); | ||
const got = require('@/utils/got'); | ||
const { parseDate } = require('@/utils/parse-date'); | ||
const chromeMobileUserAgent = require('@/utils/rand-user-agent')({ browser: 'chrome', os: 'android', device: 'mobile' }); | ||
const { renderHTML } = require('./utils'); | ||
|
||
module.exports = async (ctx) => { | ||
const categoryId = ctx.params.category_id; | ||
const rssUrl = `https://www.scmp.com/rss/${categoryId}/feed`; | ||
const feed = await parser.parseURL(rssUrl); | ||
const { data: response } = await got(rssUrl); | ||
const $ = cheerio.load(response, { | ||
xmlMode: true, | ||
}); | ||
|
||
const list = $('item') | ||
.toArray() | ||
.map((item) => { | ||
item = $(item); | ||
const enclosure = item.find('enclosure').first(); | ||
const mediaContent = item.find('media\\:content').toArray()[0]; | ||
const thumbnail = item.find('media\\:thumbnail').toArray()[0]; | ||
return { | ||
title: item.find('title').text(), | ||
description: item.find('description').text(), | ||
link: item.find('link').text().split('?utm_source')[0], | ||
author: item.find('author').text(), | ||
pubDate: parseDate(item.find('pubDate').text()), | ||
enclosure_url: enclosure?.attr('url'), | ||
enclosure_length: enclosure?.attr('length'), | ||
enclosure_type: enclosure?.attr('type'), | ||
media: { | ||
content: Object.keys(mediaContent.attribs).reduce((data, key) => { | ||
data[key] = mediaContent.attribs[key]; | ||
return data; | ||
}, {}), | ||
thumbnail: thumbnail?.attribs | ||
? Object.keys(thumbnail.attribs).reduce((data, attr) => { | ||
data[attr] = thumbnail.attribs[attr]; | ||
return data; | ||
}, {}) | ||
: undefined, | ||
}, | ||
}; | ||
}); | ||
|
||
const items = await Promise.all( | ||
feed.items.map((item) => | ||
list.map((item) => | ||
ctx.cache.tryGet(item.link, async () => { | ||
// Fetch the AMP version | ||
const url = item.link.replace(/^https:\/\/www\.scmp\.com/, 'https://amp.scmp.com'); | ||
const response = await got(url, { | ||
headers: { | ||
'User-Agent': chromeMobileUserAgent, | ||
}, | ||
}); | ||
const html = response.data; | ||
const $ = cheerio.load(html); | ||
const content = $('div.article-body.clearfix'); | ||
|
||
// Cover | ||
const cover = $('.article-images > amp-carousel > .i-amphtml-slides-container >.i-amphtml-slide-item > amp-img > img'); | ||
const { data: response, url } = await got(item.link); | ||
|
||
if (cover.length > 0) { | ||
$(`<img src=${cover[0].attribs.content}>`).insertBefore(content[0].childNodes[0]); | ||
$(cover).remove(); | ||
if (new URL(url).hostname !== 'www.scmp.com') { | ||
// e.g., https://multimedia.scmp.com/ | ||
return item; | ||
} | ||
|
||
// Summary | ||
const summary = $('div.article-header__subhead > ul'); | ||
|
||
// Metadata (categories & updatedAt) | ||
const updatedAt = $('meta[itemprop="dateModified"]').attr('content'); | ||
const publishedAt = item.pubDate || $('meta[itemprop="datePublished"]').attr('content'); | ||
|
||
const categories = $('meta[name="keywords"]') | ||
.attr('content') | ||
.split(',') | ||
.map((c) => c.trim()); | ||
const $ = cheerio.load(response); | ||
|
||
// Images | ||
content.find('amp-img').each((i, e) => { | ||
const img = $(`<img width="${e.attribs.width}" height="${e.attribs.height}" src="${e.attribs.src}" alt="${e.attribs.alt}">`); | ||
const nextData = JSON.parse($('script#__NEXT_DATA__').text()); | ||
const { article } = nextData.props.pageProps.payload.data; | ||
|
||
// Caption follows, no need to handle caption | ||
$(img).insertBefore(e); | ||
$(e).remove(); | ||
}); | ||
// item.nextData = article; | ||
|
||
// iframes (youtube videos and interactive elements) | ||
content.find('amp-iframe').each((i, e) => { | ||
if ($(e).find('iframe').length > 0) { | ||
const iframe = $(e).find('iframe')[0]; | ||
$(iframe).insertBefore(e); | ||
$(e).remove(); | ||
} | ||
}); | ||
item.summary = renderHTML(article.summary.json); | ||
item.description = renderHTML(article.subHeadline.json) + renderHTML(article.images.find((i) => i.type === 'leading')) + renderHTML(article.body.json); | ||
item.updated = parseDate(article.updatedDate, 'x'); | ||
item.category = [...new Set([...article.topics.map((t) => t.name), ...article.sections.flatMap((t) => t.value.map((v) => v.name)), ...article.keywords.map((k) => k?.split(', '))])]; | ||
|
||
content.find('div.video-wrapper > amp-iframe').each((i, e) => { | ||
const iframe = $(`<iframe width="${e.attribs.width}" height="${e.attribs.height}" src="${e.attribs.src}">`); | ||
$(iframe).insertBefore(e); | ||
$(e).remove(); | ||
}); | ||
// N.B. gallery in article is not rendered | ||
// e.g., { type: 'div', attribs: { class: 'scmp-photo-gallery', 'data-gallery-nid': '3239409' }} | ||
// from https://www.scmp.com/news/china/politics/article/3239355/li-keqiang-former-premier-china-dead | ||
|
||
// Remove unwanted DOMs | ||
const unwanted_element_selectors = [ | ||
'[class*="-advert"]', | ||
'.social-share', | ||
'.article-body-after', | ||
'scmp-chinaR2-early-text', | ||
'.newsletter-widget-wrapper', | ||
'[id*="-tracker"]', | ||
'[class^="advert-"]', | ||
'amp-list', | ||
'.more-on-this', | ||
]; | ||
unwanted_element_selectors.forEach((selector) => { | ||
content.find(selector).each((i, e) => { | ||
$(e).remove(); | ||
}); | ||
}); | ||
|
||
return { | ||
title: item.title, | ||
id: item.guid, | ||
pubDate: parseDate(publishedAt), | ||
updated: parseDate(updatedAt), | ||
author: item.creator, | ||
link: item.link, | ||
summary: summary.html(), | ||
description: content.html(), | ||
category: categories, | ||
icon: 'https://assets.i-scmp.com/static/img/icons/scmp-icon-256x256.png', | ||
logo: 'https://customerservice.scmp.com/img/logo_scmp@2x.png', | ||
}; | ||
return item; | ||
}) | ||
) | ||
); | ||
|
||
ctx.state.json = { | ||
items, | ||
}; | ||
|
||
ctx.state.data = { | ||
title: feed.title, | ||
link: feed.link, | ||
description: feed.description, | ||
title: $('channel > title').text(), | ||
link: $('channel > link').text(), | ||
description: $('channel > description').text(), | ||
item: items, | ||
language: 'en-hk', | ||
icon: 'https://assets.i-scmp.com/static/img/icons/scmp-icon-256x256.png', | ||
logo: 'https://customerservice.scmp.com/img/logo_scmp@2x.png', | ||
image: $('channel > image > url').text(), | ||
}; | ||
}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
/**
 * Serialise an attribute map into space-separated `key="value"` pairs.
 *
 * Double quotes inside a value are escaped so the value cannot terminate the
 * attribute early. A missing map yields an empty string.
 *
 * @param {object} [attribs] - Attribute names mapped to their values.
 * @returns {string} The attribute text, without leading or trailing spaces.
 */
const renderAttribs = (attribs) =>
    Object.entries(attribs ?? {})
        .map(([key, value]) => `${key}="${String(value).replace(/"/g, '&quot;')}"`)
        .join(' ');

/**
 * Render a node tree (or an array of such trees) to an HTML string.
 *
 * Nodes are plain objects with a `type` plus, depending on the type, `attribs`,
 * `children`, `data`, `url` or `title`.
 *
 * @param {object|object[]|null|undefined} node - A node, a list of nodes, or nothing.
 * @returns {string} The rendered HTML; an empty string for a falsy node. Unknown
 *     node types are rendered as a visible `Unhandled type: …` marker.
 */
const renderHTML = (node) => {
    if (!node) {
        return '';
    }
    if (Array.isArray(node)) {
        return node.map((n) => renderHTML(n)).join('');
    }

    switch (node.type) {
        case 'a':
            return `<a ${renderAttribs(node.attribs)}>${renderHTML(node.children)}</a>`;
        case 'div':
            return `<div ${renderAttribs(node.attribs)}>${renderHTML(node.children)}</div>`;
        case 'blockquote-quote':
            return `<blockquote>${renderHTML(node.children)}</blockquote>`;
        case 'iframe':
            return `<iframe ${renderAttribs(node.attribs)}></iframe>`;
        case 'leading':
        case 'img': {
            // A leading image has no `attribs`; its address is in `url`, and it must be
            // emitted as `src` for the image to load.
            const attrs = renderAttribs(node.attribs ?? { src: node.url });
            const caption = node.attribs?.title ?? node.title;
            // Leave the caption out rather than printing the text "undefined".
            const figcaption = caption ? `<figcaption>${caption}</figcaption>` : '';
            return `<figure><img ${attrs}>${figcaption}</figure>`;
        }
        case 'em':
        case 'h3':
        case 'li':
        case 'ol':
        case 'ul':
        case 'p':
        case 'strong':
        case 'u':
            return `<${node.type}>${renderHTML(node.children)}</${node.type}>`;
        case 'text':
            return node.data;
        case 'script':
        case 'inline-ad-slot':
        case 'inline-widget':
        case 'track-viewed-percentage':
            // Scripts, ads, widgets and trackers are omitted from the output.
            return '';
        default:
            // Surface unknown node types in the output instead of dropping them silently.
            return `Unhandled type: ${node.type} ${JSON.stringify(node)}`;
    }
};
|
||
module.exports = { | ||
renderHTML, | ||
}; |