Skip to content

Commit

Permalink
refactor(route/xiaohongshu): merge helper methods to util
Browse files Browse the repository at this point in the history
  • Loading branch information
pseudoyu committed Nov 22, 2024
1 parent 1ec9ab9 commit 63fae03
Show file tree
Hide file tree
Showing 2 changed files with 116 additions and 208 deletions.
107 changes: 4 additions & 103 deletions lib/routes/xiaohongshu/user.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
import { Route, ViewType } from '@/types';
import cache from '@/utils/cache';
import { getUser } from './util';
import { getUser, renderNotesFulltext, getUserWithCookie } from './util';
import InvalidParameterError from '@/errors/types/invalid-parameter';
import ofetch from '@/utils/ofetch';
import { load } from 'cheerio';
import { config } from '@/config';

export const route: Route = {
Expand Down Expand Up @@ -69,14 +67,14 @@ async function handler(ctx) {
};
} catch {
// Fallback to normal logic if cookie method fails
return await getUserFeedWithoutCookie(url, category);
return await getUserFeeds(url, category);
}
} else {
return await getUserFeedWithoutCookie(url, category);
return await getUserFeeds(url, category);
}
}

async function getUserFeedWithoutCookie(url: string, category: string) {
async function getUserFeeds(url: string, category: string) {
const {
userPageData: { basicInfo, interactions, tags },
notes,
Expand Down Expand Up @@ -125,100 +123,3 @@ async function getUserFeedWithoutCookie(url: string, category: string) {
item: category === 'notes' ? renderNote(notes) : renderCollect(collect),
};
}

/**
 * Builds full-text feed items for every note entry found in `notes`.
 *
 * `notes` is iterated as a list of groups, each group being a list of
 * `{ noteCard, id }` entries. For each entry the note page at
 * `${urlPrex}/${id}` is resolved through `getFullNote`, and the result is
 * combined with the author nickname and note id taken from `noteCard`.
 *
 * @param notes - Nested list (groups of note entries).
 * @param urlPrex - URL prefix that is joined with each entry id to form the note link.
 * @returns The feed items, in the same order as the flattened input.
 */
async function renderNotesFulltext(notes, urlPrex) {
    type FeedItem = {
        title: string;
        link: string;
        description: string;
        author: string;
        guid: string;
        pubDate: Date;
    };

    // Resolves a single note entry into a feed item.
    const buildItem = async ({ noteCard, id }): Promise<FeedItem> => {
        const link = `${urlPrex}/${id}`;
        const { title, description, pubDate } = await getFullNote(link);
        return {
            title,
            link,
            description,
            author: noteCard.user.nickName,
            guid: noteCard.noteId,
            pubDate,
        };
    };

    // All notes are fetched concurrently; Promise.all preserves input order.
    const pending = notes.flatMap((group) => group.map((entry) => buildItem(entry)));
    const resolved: FeedItem[] = await Promise.all(pending);
    return [...resolved];
}

/**
 * Fetches a note page and renders its title, HTML description and publish date.
 *
 * The result is obtained through `cache.tryGet` keyed by `link`. The page's
 * `window.__INITIAL_STATE__` script is parsed as JSON (after replacing
 * `undefined` with `null`) and the note referenced by `firstNoteId` is used.
 *
 * @param link - URL of the note page; also used as the cache key.
 * @returns `{ title, description, pubDate }` for the note.
 */
async function getFullNote(link) {
    const statePrefix = 'window.__INITIAL_STATE__=';
    const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36';
    const cookie = config.xiaohongshu.cookie;

    const data = (await cache.tryGet(link, async () => {
        // The Cookie header is only sent when a cookie is configured.
        const headers = cookie ? { 'User-Agent': userAgent, Cookie: cookie } : { 'User-Agent': userAgent };
        const html = await ofetch(link, { headers });
        const $ = load(html);

        const rawState = $('script')
            .filter((_index, element) => {
                const text = element.children[0]?.data;
                return text?.startsWith(statePrefix);
            })
            .text()
            .slice(statePrefix.length)
            // `undefined` is not valid JSON, so it is rewritten before parsing.
            .replaceAll('undefined', 'null');
        const state = JSON.parse(rawState);

        const note = state.note.noteDetailMap[state.note.firstNoteId].note;
        const imageUrls = note.imageList.map((image) => image.urlDefault);
        const title = note.title;
        const desc = note.desc
            .replaceAll(/\[.*?\]/g, '') // drop bracketed tokens such as "[xxx]"
            .replaceAll(/#(.*?)#/g, '#$1') // "#tag#" -> "#tag"
            .replaceAll('\n', '<br>');
        const pubDate = new Date(note.time);
        const imageHtml = imageUrls.map((image) => `<img src="${image}">`).join('');

        return {
            title,
            description: `${imageHtml}<br>${title}<br>${desc}`,
            pubDate,
        };
    })) as Promise<{ title: string; description: string; pubDate: Date }>;
    return data;
}

/**
 * Loads a user page with the given cookie and returns the `user` part of the
 * page's `window.__INITIAL_STATE__`.
 *
 * Each note in `state.user.notes` (flattened) is paired by position with an
 * anchor found under `#userPostedFeeds`; when the paired value contains a
 * query string, that query string is appended to the note's `id`.
 *
 * @param url - URL of the user page.
 * @param cookie - Cookie header value sent with the request.
 * @returns The parsed `state.user` object with note ids adjusted in place.
 */
async function getUserWithCookie(url: string, cookie: string) {
    const statePrefix = 'window.__INITIAL_STATE__=';
    const html = await ofetch(url, {
        headers: {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
            Cookie: cookie,
        },
    });
    const $ = load(html);

    // NOTE(review): relies on the 4th attribute of each anchor carrying the note path — confirm against the page markup.
    const paths = $('#userPostedFeeds > section > div > a.cover.ld.mask').map((_index, anchor) => anchor.attributes[3].value);

    const stateScripts = $('script').filter((_index, element) => {
        const text = element.children[0]?.data;
        return text?.startsWith(statePrefix);
    });
    // `undefined` is not valid JSON, so it is rewritten before parsing.
    const state = JSON.parse(stateScripts.text().slice(statePrefix.length).replaceAll('undefined', 'null'));

    state.user.notes.flat().forEach((item, index) => {
        const path = paths[index];
        if (path && path.includes('?')) {
            item.id = item.id + path.substring(path.indexOf('?'));
        }
    });
    return state.user;
}
217 changes: 112 additions & 105 deletions lib/routes/xiaohongshu/util.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,30 @@ import { config } from '@/config';
import logger from '@/utils/logger';
import { parseDate } from '@/utils/parse-date';
import puppeteer from '@/utils/puppeteer';
import { ofetch } from 'ofetch';
import { load } from 'cheerio';
import cache from '@/utils/cache';

/**
 * Returns the common request headers used for xiaohongshu page fetches.
 *
 * The header set describes a Chrome 122 / Windows document navigation. A
 * `Cookie` header is appended only when a non-empty `cookie` is supplied.
 *
 * @param cookie - Optional cookie string to send with the request.
 * @returns A fresh header object on every call.
 */
const getHeaders = (cookie?: string) => {
    const headers: Record<string, string> = {
        Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9',
        'Cache-Control': 'no-cache',
        Connection: 'keep-alive',
        Host: 'www.xiaohongshu.com',
        Pragma: 'no-cache',
        'Sec-Ch-Ua': '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
    };
    // An empty or missing cookie means no Cookie header at all.
    if (cookie) {
        headers.Cookie = cookie;
    }
    return headers;
};

const getUser = (url, cache) =>
cache.tryGet(
Expand All @@ -23,7 +47,7 @@ const getUser = (url, cache) =>
});
await page.waitForSelector('div.reds-tab-item:nth-child(2)');

const initialState = await page.evaluate(() => window.__INITIAL_STATE__);
const initialState = await page.evaluate(() => (window as any).__INITIAL_STATE__);

if (!(await page.$('.lock-icon'))) {
await page.click('div.reds-tab-item:nth-child(2)');
Expand Down Expand Up @@ -68,7 +92,7 @@ const getBoard = (url, cache) =>
logger.http(`Requesting ${url}`);
await page.goto(url);
await page.waitForSelector('.pc-container');
const initialSsrState = await page.evaluate(() => window.__INITIAL_SSR_STATE__);
const initialSsrState = await page.evaluate(() => (window as any).__INITIAL_SSR_STATE__);
return initialSsrState.Main;
} finally {
browser.close();
Expand All @@ -78,108 +102,6 @@ const getBoard = (url, cache) =>
false
);

/**
 * Turns on request interception for `page` and registers a `request` handler
 * that lets through only `document`, `script`, `xhr` and `other` resources;
 * every other request is aborted.
 *
 * @param page - Page object exposing `setRequestInterception` and `on`.
 */
const setPageFilter = async (page) => {
    const allowedTypes = ['document', 'script', 'xhr', 'other'];
    await page.setRequestInterception(true);
    page.on('request', (req) => {
        if (allowedTypes.includes(req.resourceType())) {
            req.continue();
        } else {
            req.abort();
        }
    });
};

/**
 * Collects a user's basic info and the full content of each posted note by
 * driving a headless browser, with the whole result cached under `url + '/notes'`.
 *
 * Flow: open the user page, wait for the `otherinfo` and `user_posted` API
 * responses, then open every note page (each cached under its own URL) and
 * read the note from `window.__INITIAL_STATE__`, falling back to the `feed`
 * API response when that state holds no note.
 *
 * @param url - URL of the user page; note URLs are built as `url + '/' + note_id`.
 * @param cache - Cache object exposing `tryGet`.
 * @returns `{ user, notes }` where `user` is `otherInfo.data.basic_info` and
 *          `notes` is the list of resolved note objects.
 * @throws Error when the user/note-list responses or any single note cannot be obtained.
 */
const getNotes = (url, cache) =>
cache.tryGet(
url + '/notes', // To avoid mixing with the cache for `user.js`
async () => {
let user = '';
let notes = [];

const browser = await puppeteer({ stealth: true });
try {
const page = await browser.newPage();
// Only document/script/xhr/other requests are allowed through.
await setPageFilter(page);

logger.http(`Requesting ${url}`);
await page.goto(url);

let otherInfo = {};
let userPosted = {};
try {
// Wait for both API responses (GET requests whose URL contains the path) and parse them as JSON.
[otherInfo, userPosted] = await Promise.all(
['/api/sns/web/v1/user/otherinfo', '/api/sns/web/v1/user_posted'].map((p) =>
page
.waitForResponse((res) => {
const req = res.request();
return req.url().includes(p) && req.method() === 'GET';
})
.then((r) => r.json())
)
);
} catch (error) {
throw new Error(`Could not get user information and note list\n${error}`);
}

await page.close();

// Get full text for each note
const notesPromise = userPosted.data.notes.map((n) => {
const noteUrl = url + '/' + n.note_id;

// Each note is cached individually under its own URL.
return cache.tryGet(noteUrl, async () => {
const notePage = await browser.newPage();
await setPageFilter(notePage);

logger.http(`Requesting ${noteUrl}`);
await notePage.goto(noteUrl);

let feed = {};
try {
feed = await notePage.evaluate(() => window.__INITIAL_STATE__);

// Sometimes the page is not server-side rendered
if (feed?.note?.note === undefined || JSON.stringify(feed?.note?.note) === '{}') {
// Fall back to the POST response of the feed API and rebuild the note object from it.
const res = await notePage.waitForResponse((res) => {
const req = res.request();
return req.url().includes('/api/sns/web/v1/feed') && req.method() === 'POST';
});

const json = await res.json();
const note_card = json.data.items[0].note_card;
// Map the snake_case API fields onto the camelCase shape used by the page state.
feed.note.note = {
title: note_card.title,
noteId: note_card.id,
desc: note_card.desc,
tagList: note_card.tag_list,
imageList: note_card.image_list,
user: note_card.user,
time: note_card.time,
lastUpdateTime: note_card.last_update_time,
};
}
} catch (error) {
throw new Error(`Could not get note ${n.note_id}\n${error}`);
}

await notePage.close();

// A missing or empty note object is treated as a failure.
if (feed?.note?.note !== undefined && JSON.stringify(feed?.note?.note) !== '{}') {
return feed.note.note;
} else {
throw new Error(`Could not get note ${n.note_id}`);
}
});
});

user = otherInfo.data.basic_info;
notes = await Promise.all(notesPromise);
} finally {
// The browser is always closed, including when any step above throws.
await browser.close();
}

return { user, notes };
},
// NOTE(review): presumably the cache lifetime and a "refresh" flag — confirm against the cache utility's tryGet signature.
config.cache.routeExpire,
false
);

/**
 * Prepares plain text for HTML output: every line break (`\r\n`, `\r` or `\n`)
 * becomes `<br>` and every tab character becomes `&emsp;`.
 */
const formatText = (text) => {
    const withBreaks = text.replaceAll(/\r\n|\r|\n/g, '<br>');
    return withBreaks.split('\t').join('&emsp;');
};

// tag_list.id has nothing to do with its url
Expand All @@ -196,4 +118,89 @@ const formatNote = (url, note) => ({
updated: parseDate(note.lastUpdateTime, 'x'),
});

export { getUser, getBoard, getNotes, formatText, formatNote };
/**
 * Resolves every note entry in `notes` into a full-text feed item.
 *
 * `notes` is iterated as groups of `{ noteCard, id }` entries. The note page
 * for each entry (`${urlPrex}/${id}`) is resolved through `getFullNote`;
 * author and guid come from the entry's `noteCard`.
 *
 * @param notes - Nested list (groups of note entries).
 * @param urlPrex - URL prefix joined with each entry id to build the note link.
 * @returns Feed items in the order of the flattened input.
 */
async function renderNotesFulltext(notes, urlPrex) {
    // Converts one note entry into a feed item.
    const toItem = async ({ noteCard, id }) => {
        const link = `${urlPrex}/${id}`;
        const { title, description, pubDate } = await getFullNote(link);
        return {
            title,
            link,
            description,
            author: noteCard.user.nickName,
            guid: noteCard.noteId,
            pubDate,
        };
    };

    // Every note is requested concurrently; Promise.all keeps the input order.
    const items: Array<{
        title: string;
        link: string;
        description: string;
        author: string;
        guid: string;
        pubDate: Date;
    }> = await Promise.all(notes.flatMap((group) => group.map((entry) => toItem(entry))));
    return items;
}

/**
 * Fetches a note page and renders its title, HTML description and publish date.
 *
 * The result is obtained through `cache.tryGet` keyed by `link`. The page is
 * requested with the common headers (including the configured cookie, if any),
 * its `window.__INITIAL_STATE__` is parsed, and the note referenced by
 * `firstNoteId` is rendered.
 *
 * @param link - URL of the note page; also used as the cache key.
 * @returns `{ title, description, pubDate }` for the note.
 * @throws Error when the parsed state does not contain the expected note entry.
 */
async function getFullNote(link) {
    // The awaited value is the note content itself, not a promise of it.
    const data = (await cache.tryGet(link, async () => {
        const res = await ofetch(link, {
            headers: getHeaders(config.xiaohongshu.cookie),
        });
        const $ = load(res);
        const state = JSON.parse(extractInitialState($));

        // Fail with a readable message instead of an opaque TypeError when the page carries no note data.
        const note = state?.note?.noteDetailMap?.[state.note.firstNoteId]?.note;
        if (!note) {
            throw new Error(`Could not find note detail in the initial state of ${link}`);
        }

        const images = note.imageList.map((image) => image.urlDefault);
        const title = note.title;
        let desc = note.desc;
        desc = desc.replaceAll(/\[.*?\]/g, ''); // drop bracketed tokens such as "[xxx]"
        desc = desc.replaceAll(/#(.*?)#/g, '#$1'); // "#tag#" -> "#tag"
        desc = desc.replaceAll('\n', '<br>');
        const pubDate = new Date(note.time);
        const description = `${images.map((image) => `<img src="${image}">`).join('')}<br>${title}<br>${desc}`;
        return {
            title,
            description,
            pubDate,
        };
    })) as { title: string; description: string; pubDate: Date };
    return data;
}

/**
 * Loads a user page with the given cookie and returns the `user` part of the
 * page's initial state.
 *
 * Notes in `state.user.notes` (flattened) are paired by position with the
 * anchors found under `#userPostedFeeds`; when the paired value contains a
 * query string, that query string is appended to the note's `id`.
 *
 * @param url - URL of the user page.
 * @param cookie - Cookie header value sent with the request.
 * @returns The parsed `state.user` object with note ids adjusted in place.
 */
async function getUserWithCookie(url: string, cookie: string) {
    const html = await ofetch(url, { headers: getHeaders(cookie) });
    const $ = load(html);

    // NOTE(review): relies on the 4th attribute of each anchor carrying the note path — confirm against the page markup.
    const paths = $('#userPostedFeeds > section > div > a.cover.ld.mask').map((_position, anchor) => anchor.attributes[3].value);

    const state = JSON.parse(extractInitialState($));
    for (const [index, item] of state.user.notes.flat().entries()) {
        const path = paths[index];
        if (!path || !path.includes('?')) {
            continue;
        }
        // Carry the link's query string over to the note id.
        item.id = item.id + path.substring(path.indexOf('?'));
    }
    return state.user;
}

/**
 * Extracts the JSON text assigned to `window.__INITIAL_STATE__` from a loaded page.
 *
 * The text of the `<script>` element(s) starting with `window.__INITIAL_STATE__=`
 * is taken, the assignment prefix is removed, and bare `undefined` tokens are
 * rewritten to `null` so the result can be passed to `JSON.parse`. Occurrences
 * of the word "undefined" inside double-quoted string values are left intact.
 *
 * @param $ - Cheerio-style selector function for the loaded document.
 * @returns The JSON-parsable state text.
 * @throws Error when no script containing the initial state is found.
 */
function extractInitialState($) {
    const prefix = 'window.__INITIAL_STATE__=';
    const raw = $('script')
        .filter((_index, element) => {
            const text = element.children[0]?.data;
            return text?.startsWith(prefix);
        })
        .text();

    if (!raw) {
        // Without this check the caller's JSON.parse fails on an empty string with an unhelpful message.
        throw new Error('Could not find window.__INITIAL_STATE__ in the page');
    }

    // Match whole string literals first so that only `undefined` tokens outside of strings are replaced.
    return raw.slice(prefix.length).replace(/"(?:[^"\\]|\\.)*"|\bundefined\b/g, (token: string) => (token === 'undefined' ? 'null' : token));
}

export { getUser, getBoard, formatText, formatNote, renderNotesFulltext, getFullNote, getUserWithCookie };

0 comments on commit 63fae03

Please sign in to comment.