Skip to content

Commit

Permalink
fix(route): scmp full text (#13667)
Browse files Browse the repository at this point in the history
* fix(route): scmp full text

* fix: null node check
  • Loading branch information
TonyRL authored Oct 31, 2023
1 parent 8143ea1 commit 51ac5f8
Show file tree
Hide file tree
Showing 2 changed files with 121 additions and 85 deletions.
146 changes: 61 additions & 85 deletions lib/v2/scmp/index.js
Original file line number Diff line number Diff line change
@@ -1,114 +1,90 @@
const parser = require('@/utils/rss-parser');
const cheerio = require('cheerio');
const got = require('@/utils/got');
const { parseDate } = require('@/utils/parse-date');
const chromeMobileUserAgent = require('@/utils/rand-user-agent')({ browser: 'chrome', os: 'android', device: 'mobile' });
const { renderHTML } = require('./utils');

module.exports = async (ctx) => {
const categoryId = ctx.params.category_id;
const rssUrl = `https://www.scmp.com/rss/${categoryId}/feed`;
const feed = await parser.parseURL(rssUrl);
const { data: response } = await got(rssUrl);
const $ = cheerio.load(response, {
xmlMode: true,
});

// Convert every <item> element of the RSS feed into a plain item object.
// `$` is the cheerio instance loaded (in XML mode) from the feed response.
const list = $('item')
    .toArray()
    .map((element) => {
        const item = $(element);
        const enclosure = item.find('enclosure').first();
        // Raw DOM nodes; each is undefined when the <item> lacks that media:* element.
        const mediaContent = item.find('media\\:content').toArray()[0];
        const thumbnail = item.find('media\\:thumbnail').toArray()[0];
        return {
            title: item.find('title').text(),
            description: item.find('description').text(),
            // Drop the tracking query string appended by the feed.
            link: item.find('link').text().split('?utm_source')[0],
            author: item.find('author').text(),
            pubDate: parseDate(item.find('pubDate').text()),
            enclosure_url: enclosure?.attr('url'),
            enclosure_length: enclosure?.attr('length'),
            enclosure_type: enclosure?.attr('type'),
            media: {
                // Shallow-copy the attributes; guarded the same way as the thumbnail so an
                // item without <media:content> no longer throws and fails the whole feed.
                content: mediaContent?.attribs ? { ...mediaContent.attribs } : undefined,
                thumbnail: thumbnail?.attribs ? { ...thumbnail.attribs } : undefined,
            },
        };
    });

const items = await Promise.all(
feed.items.map((item) =>
list.map((item) =>
ctx.cache.tryGet(item.link, async () => {
// Fetch the AMP version
const url = item.link.replace(/^https:\/\/www\.scmp\.com/, 'https://amp.scmp.com');
const response = await got(url, {
headers: {
'User-Agent': chromeMobileUserAgent,
},
});
const html = response.data;
const $ = cheerio.load(html);
const content = $('div.article-body.clearfix');

// Cover
const cover = $('.article-images > amp-carousel > .i-amphtml-slides-container >.i-amphtml-slide-item > amp-img > img');
const { data: response, url } = await got(item.link);

if (cover.length > 0) {
$(`<img src=${cover[0].attribs.content}>`).insertBefore(content[0].childNodes[0]);
$(cover).remove();
if (new URL(url).hostname !== 'www.scmp.com') {
// e.g., https://multimedia.scmp.com/
return item;
}

// Summary
const summary = $('div.article-header__subhead > ul');

// Metadata (categories & updatedAt)
const updatedAt = $('meta[itemprop="dateModified"]').attr('content');
const publishedAt = item.pubDate || $('meta[itemprop="datePublished"]').attr('content');

const categories = $('meta[name="keywords"]')
.attr('content')
.split(',')
.map((c) => c.trim());
const $ = cheerio.load(response);

// Images
content.find('amp-img').each((i, e) => {
const img = $(`<img width="${e.attribs.width}" height="${e.attribs.height}" src="${e.attribs.src}" alt="${e.attribs.alt}">`);
const nextData = JSON.parse($('script#__NEXT_DATA__').text());
const { article } = nextData.props.pageProps.payload.data;

// Caption follows, no need to handle caption
$(img).insertBefore(e);
$(e).remove();
});
// item.nextData = article;

// iframes (youtube videos and interactive elements)
content.find('amp-iframe').each((i, e) => {
if ($(e).find('iframe').length > 0) {
const iframe = $(e).find('iframe')[0];
$(iframe).insertBefore(e);
$(e).remove();
}
});
// Enrich the feed item with content rendered from the article payload.
item.summary = renderHTML(article.summary.json);
// Sub-headline, then the leading image (if any), then the article body.
item.description = renderHTML(article.subHeadline.json) + renderHTML(article.images.find((i) => i.type === 'leading')) + renderHTML(article.body.json);
item.updated = parseDate(article.updatedDate, 'x');
// De-duplicated category names. Each keywords entry is a ', '-separated string, so it is
// flattened with flatMap (a plain map would put whole arrays into the Set), and
// null/empty entries are dropped.
item.category = [
    ...new Set(
        [
            ...article.topics.map((t) => t.name),
            ...article.sections.flatMap((t) => t.value.map((v) => v.name)),
            ...article.keywords.flatMap((k) => k?.split(', ')),
        ].filter(Boolean)
    ),
];

content.find('div.video-wrapper > amp-iframe').each((i, e) => {
const iframe = $(`<iframe width="${e.attribs.width}" height="${e.attribs.height}" src="${e.attribs.src}">`);
$(iframe).insertBefore(e);
$(e).remove();
});
// N.B. gallery in article is not rendered
// e.g., { type: 'div', attribs: { class: 'scmp-photo-gallery', 'data-gallery-nid': '3239409' }}
// from https://www.scmp.com/news/china/politics/article/3239355/li-keqiang-former-premier-china-dead

// Remove unwanted DOMs
const unwanted_element_selectors = [
'[class*="-advert"]',
'.social-share',
'.article-body-after',
'scmp-chinaR2-early-text',
'.newsletter-widget-wrapper',
'[id*="-tracker"]',
'[class^="advert-"]',
'amp-list',
'.more-on-this',
];
unwanted_element_selectors.forEach((selector) => {
content.find(selector).each((i, e) => {
$(e).remove();
});
});

return {
title: item.title,
id: item.guid,
pubDate: parseDate(publishedAt),
updated: parseDate(updatedAt),
author: item.creator,
link: item.link,
summary: summary.html(),
description: content.html(),
category: categories,
icon: 'https://assets.i-scmp.com/static/img/icons/scmp-icon-256x256.png',
logo: 'https://customerservice.scmp.com/img/logo_scmp@2x.png',
};
return item;
})
)
);

ctx.state.json = {
items,
};

ctx.state.data = {
title: feed.title,
link: feed.link,
description: feed.description,
title: $('channel > title').text(),
link: $('channel > link').text(),
description: $('channel > description').text(),
item: items,
language: 'en-hk',
icon: 'https://assets.i-scmp.com/static/img/icons/scmp-icon-256x256.png',
logo: 'https://customerservice.scmp.com/img/logo_scmp@2x.png',
image: $('channel > image > url').text(),
};
};
60 changes: 60 additions & 0 deletions lib/v2/scmp/utils.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/**
 * Serialise an attribute map into an HTML attribute string (`key="value" ...`).
 * Double quotes inside values are escaped so they cannot terminate the attribute.
 *
 * @param {Object<string, *>} [attribs] - Attribute names mapped to values.
 * @returns {string} The attribute string, or '' when `attribs` is missing.
 */
const renderAttribs = (attribs) =>
    attribs
        ? Object.entries(attribs)
              .map(([key, value]) => `${key}="${String(value).replace(/"/g, '&quot;')}"`)
              .join(' ')
        : '';

/**
 * Recursively render a JSON content node (or an array of nodes) into an HTML string.
 *
 * A node is an object with a `type` and, depending on the type, `attribs`,
 * `children`, `data` (text nodes) or `url`/`title` (leading image).
 *
 * @param {Object|Object[]|null|undefined} node - Node, list of nodes, or nothing.
 * @returns {string} HTML for the node; '' for missing nodes and for node types that
 *     are deliberately dropped (scripts, ad slots, widgets, trackers). Unknown types
 *     yield a visible "Unhandled type" marker so they can be spotted and supported.
 */
const renderHTML = (node) => {
    if (!node) {
        return '';
    }
    if (Array.isArray(node)) {
        return node.map((n) => renderHTML(n)).join('');
    }

    switch (node.type) {
        case 'a':
            return `<a ${renderAttribs(node.attribs)}>${renderHTML(node.children)}</a>`;
        case 'div':
            return `<div ${renderAttribs(node.attribs)}>${renderHTML(node.children)}</div>`;
        case 'blockquote-quote':
            return `<blockquote>${renderHTML(node.children)}</blockquote>`;
        case 'iframe':
            return `<iframe ${renderAttribs(node.attribs)}></iframe>`;
        case 'leading':
        case 'img': {
            // A leading image has no `attribs`; it carries `url` and `title` directly,
            // and the URL must be emitted as `src` for the image to load.
            const attribs = renderAttribs(node.attribs ?? { src: node.url });
            // Fall back to an empty caption instead of rendering the text "undefined".
            const caption = node.attribs?.title ?? node.title ?? '';
            return `<figure><img ${attribs}><figcaption>${caption}</figcaption></figure>`;
        }
        case 'em':
        case 'h3':
        case 'li':
        case 'ol':
        case 'ul':
        case 'p':
        case 'strong':
        case 'u':
            return `<${node.type}>${renderHTML(node.children)}</${node.type}>`;
        case 'text':
            return node.data ?? '';
        case 'script':
        case 'inline-ad-slot':
        case 'inline-widget':
        case 'track-viewed-percentage':
            return '';
        default:
            return `Unhandled type: ${node.type} ${JSON.stringify(node)}`;
    }
};

// Export the JSON-node-to-HTML renderer (CommonJS).
module.exports = {
renderHTML,
};

0 comments on commit 51ac5f8

Please sign in to comment.