diff --git a/lib/config.js b/lib/config.js
index a786fd58da16b5..81f9b2c6cdc8c1 100644
--- a/lib/config.js
+++ b/lib/config.js
@@ -76,6 +76,8 @@ const calculateValue = () => {
         },
         proxyStrategy: envs.PROXY_STRATEGY || 'all', // all / on_retry
         reverseProxyUrl: envs.REVERSE_PROXY_URL,
+        pacUri: envs.PAC_URI,
+        pacScript: envs.PAC_SCRIPT,
         // auth
         authentication: {
             name: envs.HTTP_BASIC_AUTH_NAME || 'usernam3',
diff --git a/lib/utils/pac-proxy.js b/lib/utils/pac-proxy.js
new file mode 100644
index 00000000000000..ca5926773f4c57
--- /dev/null
+++ b/lib/utils/pac-proxy.js
@@ -0,0 +1,85 @@
+const config = require('@/config').value;
+const logger = require('./logger');
+
+const possibleProtocol = ['http', 'https', 'ftp', 'file', 'data'];
+
+/**
+ * Validate the PAC settings and build the proxy descriptor returned to callers.
+ *
+ * `proxyObj` is modified in place: protocol/host/port are copied from the PAC URI, or cleared together with auth when no usable PAC URI is left.
+ *
+ * @param {?string} pacUri - URI of the PAC file; its scheme must be one of `possibleProtocol`.
+ * @param {?string} pacScript - Inline PAC script; a string value replaces `pacUri` with an equivalent `data:` URI.
+ * @param {object} proxyObj - Proxy options object (`protocol`, `host`, `port`, `auth`).
+ * @returns {{proxyUri: ?string, proxyObj: object, proxyUrlHandler: ?URL}} `proxyUri` and `proxyUrlHandler` are `null` when validation fails.
+ */
+const pacProxy = (pacUri, pacScript, proxyObj) => {
+    let pacUrlHandler = null;
+
+    // Validate PAC_URI / PAC_SCRIPT
+    if (pacScript) {
+        if (typeof pacScript === 'string') {
+            pacUri = `data:text/javascript;charset=utf-8,${encodeURIComponent(pacScript)}`;
+        } else {
+            logger.error('Invalid PAC_SCRIPT, use PAC_URI instead');
+        }
+    }
+    if (pacUri && typeof pacUri === 'string') {
+        try {
+            pacUrlHandler = new URL(pacUri);
+        } catch (e) {
+            pacUri = null;
+            pacUrlHandler = null;
+            logger.error(`Parse PAC_URI error: ${e.stack}`);
+        }
+    } else {
+        pacUri = null;
+    }
+
+    // Check if PAC_URI has the right protocol
+    if (pacUri && !possibleProtocol.includes(pacUrlHandler?.protocol?.replace(':', ''))) {
+        logger.error(`Unsupported PAC protocol: ${pacUrlHandler?.protocol?.replace(':', '')}, expect one of ${possibleProtocol.join(', ')}`);
+        pacUri = null;
+        pacUrlHandler = null;
+    }
+
+    // Validate proxyObj
+    if (pacUrlHandler) {
+        proxyObj.host = pacUrlHandler.hostname;
+        proxyObj.port = Number.parseInt(pacUrlHandler.port, 10) || undefined; // URL.port is '' when absent -> NaN -> undefined
+        proxyObj.protocol = pacUrlHandler.protocol.replace(':', '');
+    } else {
+        proxyObj.protocol = proxyObj.host = proxyObj.port = proxyObj.auth = undefined;
+    }
+
+    // Validate PROXY_AUTH
+    if (proxyObj.auth && pacUrlHandler) {
+        let promptProxyUri = false;
+        if (pacUrlHandler.username || pacUrlHandler.password) {
+            logger.warn('PAC_URI contains username and/or password, ignoring PROXY_AUTH');
+            proxyObj.auth = undefined;
+        } else if (!['http:', 'https:'].includes(pacUrlHandler.protocol)) {
+            logger.warn(`PROXY_AUTH is only supported by HTTP(S) proxies, but got ${pacUrlHandler.protocol}, ignoring`);
+            proxyObj.auth = undefined;
+            promptProxyUri = true;
+        } else {
+            logger.info('PROXY_AUTH is set and will be used for requests from Node.js. However, requests from puppeteer will not use it');
+            promptProxyUri = true;
+        }
+        if (promptProxyUri) {
+            logger.info('To get rid of this, set PAC_URI like protocol://username:password@host:port and clear PROXY_{AUTH,PROTOCOL,HOST,PORT}');
+        }
+    }
+
+    // Compatible with unify-proxy
+    return {
+        proxyUri: pacUri,
+        proxyObj,
+        proxyUrlHandler: pacUrlHandler,
+    };
+};
+
+module.exports = {
+    pacProxy,
+    ...pacProxy(config.pacUri, config.pacScript, config.proxy),
+};
diff --git a/lib/utils/request-wrapper.js b/lib/utils/request-wrapper.js
index 6120df3a4fc932..c50ac0d789ee54 100644
--- a/lib/utils/request-wrapper.js
+++ b/lib/utils/request-wrapper.js
@@ -1,11 +1,15 @@
 const config = require('@/config').value;
-const { proxyUri, proxyObj, proxyUrlHandler } = require('./unify-proxy');
+const proxyIsPAC = config.pacUri || config.pacScript;
+const { proxyUri, proxyObj, proxyUrlHandler } = proxyIsPAC ?
require('./pac-proxy') : require('./unify-proxy');
 const logger = require('./logger');
 const http = require('http');
 const https = require('https');
 
 let agent = null;
-if (proxyUri) {
+if (proxyIsPAC && proxyUri) { // proxyUri is null when pac-proxy rejected PAC_URI / PAC_SCRIPT; `pac+null` would throw here
+    const { PacProxyAgent } = require('pac-proxy-agent');
+    agent = new PacProxyAgent(`pac+${proxyUri}`);
+} else if (proxyUri) {
     if (proxyUri.startsWith('http')) {
         const { HttpsProxyAgent } = require('https-proxy-agent');
         agent = new HttpsProxyAgent(proxyUri);
diff --git a/package.json b/package.json
index 19d37401b4cfd8..ac07d5adf5a673 100644
--- a/package.json
+++ b/package.json
@@ -128,6 +128,7 @@
         "module-alias": "2.2.3",
         "notion-to-md": "3.1.1",
         "oauth-1.0a": "2.2.6",
+        "pac-proxy-agent": "7.0.1",
         "plist": "3.1.0",
         "proxy-chain": "2.4.0",
         "puppeteer": "21.7.0",
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 17ed871b70ed10..56128a6e71f3b2 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -140,6 +140,9 @@ dependencies:
   oauth-1.0a:
     specifier: 2.2.6
     version: 2.2.6
+  pac-proxy-agent:
+    specifier: 7.0.1
+    version: 7.0.1
   plist:
     specifier: 3.1.0
     version: 3.1.0
diff --git a/test/utils/pac-proxy.js b/test/utils/pac-proxy.js
new file mode 100644
index 00000000000000..d98d0607ecd166
--- /dev/null
+++ b/test/utils/pac-proxy.js
@@ -0,0 +1,82 @@
+const { pacProxy } = require('../../lib/utils/pac-proxy');
+
+const emptyProxyObj = {
+    protocol: undefined,
+    host: undefined,
+    port: undefined,
+    auth: undefined,
+    url_regex: '.*',
+};
+
+const effectiveExpect = ({ proxyUri, proxyObj }, expectUri, expectObj) => {
+    expect(proxyUri).toBe(expectUri);
+    expect(proxyObj).toEqual(expectObj);
+};
+
+describe('pac-proxy', () => {
+    const nullExpect = (pac) => effectiveExpect(pac, null, emptyProxyObj);
+    it('pac empty', () => {
+        nullExpect(pacProxy(null, null, emptyProxyObj));
+    });
+    it('pac-uri invalid', () => {
+        nullExpect(pacProxy('http://inv ild.test', null, emptyProxyObj));
+    });
+    it('pac-uri invalid protocol', () => {
+        nullExpect(pacProxy('socks://rsshub.proxy', null,
emptyProxyObj));
+    });
+    // pacProxy mutates and returns its proxyObj argument, so each case below passes a fresh copy and spells out the expected descriptor
+    const httpUri = 'http://rsshub.proxy/pac.pac';
+    it('pac-uri http', () => {
+        effectiveExpect(pacProxy(httpUri, null, { ...emptyProxyObj }), httpUri, { ...emptyProxyObj, protocol: 'http', host: 'rsshub.proxy' });
+    });
+
+    const httpsUri = 'https://rsshub.proxy/pac.pac';
+    it('pac-uri https', () => {
+        effectiveExpect(pacProxy(httpsUri, null, { ...emptyProxyObj }), httpsUri, { ...emptyProxyObj, protocol: 'https', host: 'rsshub.proxy' });
+    });
+
+    const ftpUri = 'ftp://rsshub.proxy:2333';
+    it('pac-uri ftp', () => {
+        effectiveExpect(pacProxy(ftpUri, null, { ...emptyProxyObj }), ftpUri, { ...emptyProxyObj, protocol: 'ftp', host: 'rsshub.proxy', port: 2333 });
+    });
+
+    const fileUri = 'file:///path/to/pac.pac';
+    it('pac-uri file', () => {
+        effectiveExpect(pacProxy(fileUri, null, { ...emptyProxyObj }), fileUri, { ...emptyProxyObj, protocol: 'file', host: '' });
+    });
+
+    const dataPacScript = "function FindProxyForURL(url, host){return 'DIRECT';}";
+    const dataUri = 'data:text/javascript;charset=utf-8,' + encodeURIComponent(dataPacScript);
+    it('pac-script data', () => {
+        effectiveExpect(pacProxy(null, dataPacScript, { ...emptyProxyObj }), dataUri, { ...emptyProxyObj, protocol: 'data', host: '' });
+    });
+    it('pac-script data invalid type', () => {
+        effectiveExpect(pacProxy(httpsUri, 1, { ...emptyProxyObj }), httpsUri, { ...emptyProxyObj, protocol: 'https', host: 'rsshub.proxy' });
+    });
+
+    const httpsObj = { ...emptyProxyObj, protocol: 'https', host: 'rsshub.proxy', port: 2333 };
+    const httpsAuthUri = 'https://user:pass@rsshub.proxy:2333';
+    it('pac-uri https auth', () => {
+        effectiveExpect(pacProxy(httpsAuthUri, null, { ...emptyProxyObj }), httpsAuthUri, httpsObj);
+    });
+
+    const httpsAuthObj = { ...httpsObj, auth: 'testtest' };
+    it('pac proxy-obj https auth', () => {
+        effectiveExpect(pacProxy(httpsUri, null, { ...httpsAuthObj }), httpsUri, { ...httpsAuthObj, port: undefined }); // httpsUri carries no port, so the port is cleared
+    });
+
+    const ftpObj = { ...httpsObj, protocol: 'ftp' };
+    const ftpAuthUri = 'ftp://user:pass@rsshub.proxy:2333';
+    it('pac-uri ftp auth', () => {
+        effectiveExpect(pacProxy(ftpAuthUri, null, { ...emptyProxyObj }), ftpAuthUri, ftpObj);
+    });
+
+    const ftpAuthObj = { ...ftpObj, auth: 'testtest' };
+    it('pac-uri ftp auth (invalid)', () => {
+        effectiveExpect(pacProxy(ftpUri, null, { ...ftpAuthObj }), ftpUri, ftpObj);
+    });
+
+    it('pac-uri user@pass override proxy-obj auth', () => {
+        effectiveExpect(pacProxy(httpsAuthUri, null, { ...httpsAuthObj }), httpsAuthUri, httpsObj);
+    });
+});
diff --git a/test/utils/request-wrapper.js b/test/utils/request-wrapper.js
index c9b5ae166da221..75e4abebdfab9e 100644
--- a/test/utils/request-wrapper.js
+++ b/test/utils/request-wrapper.js
@@ -5,6 +5,11 @@ require('../../lib/utils/request-wrapper');
 let check = () => {};
 const simpleResponse = '';
 
+beforeEach(() => {
+    delete process.env.PAC_URI;
+    delete process.env.PAC_SCRIPT;
+});
+
 afterEach(() => {
     delete process.env.PROXY_URI;
     delete process.env.PROXY_PROTOCOL;
@@ -215,6 +220,117 @@ describe('got', () => {
         await parser.parseURL(url);
     });
 
+    it('pac-uri http', async () => {
+        process.env.PAC_URI = 'http://rsshub.proxy:2333';
+
+        jest.resetModules();
+        require('../../lib/utils/request-wrapper');
+
+        check = (request) => {
+            expect(request.agent.constructor.name).toBe('PacProxyAgent');
+            expect(request.agent.uri.protocol).toBe('http:');
+            expect(request.agent.uri.host).toBe('rsshub.proxy:2333');
+            expect(request.agent.uri.hostname).toBe('rsshub.proxy');
+            expect(request.agent.uri.port).toBe('2333');
+        };
+
+        nock(/rsshub\.test/)
+            .get('/proxy')
+            .times(2)
+            .reply(200, simpleResponse);
+
+        await got.get('http://rsshub.test/proxy');
+        await parser.parseURL('http://rsshub.test/proxy');
+    });
+
+    it('pac-uri https', async () => {
+        process.env.PAC_URI = 'https://rsshub.proxy:2333';
+
+        jest.resetModules();
+        require('../../lib/utils/request-wrapper');
+
+        check = (request) => {
+            expect(request.agent.constructor.name).toBe('PacProxyAgent');
+            expect(request.agent.uri.protocol).toBe('https:');
+            expect(request.agent.uri.host).toBe('rsshub.proxy:2333');
+            expect(request.agent.uri.hostname).toBe('rsshub.proxy');
+            expect(request.agent.uri.port).toBe('2333');
+        };
+
+        nock(/rsshub\.test/)
+            .get('/proxy')
+            .times(2)
+            .reply(200, simpleResponse);
+
+        await got.get('http://rsshub.test/proxy');
+        await parser.parseURL('http://rsshub.test/proxy');
+    });
+
+    it('pac-uri ftp', async () => {
+        process.env.PAC_URI = 'ftp://rsshub.proxy:2333';
+
+        jest.resetModules();
+        require('../../lib/utils/request-wrapper');
+
+        check = (request) => {
+            expect(request.agent.constructor.name).toBe('PacProxyAgent');
+            expect(request.agent.uri.protocol).toBe('ftp:');
+            expect(request.agent.uri.host).toBe('rsshub.proxy:2333');
+            expect(request.agent.uri.hostname).toBe('rsshub.proxy');
+            expect(request.agent.uri.port).toBe('2333');
+        };
+
+        nock(/rsshub\.test/)
+            .get('/proxy')
+            .times(2)
+            .reply(200, simpleResponse);
+
+        await got.get('http://rsshub.test/proxy');
+        await parser.parseURL('http://rsshub.test/proxy');
+    });
+
+    it('pac-uri file', async () => {
+        process.env.PAC_URI = 'file:///D:/rsshub/proxy';
+
+        jest.resetModules();
+        require('../../lib/utils/request-wrapper');
+
+        check = (request) => {
+            expect(request.agent.constructor.name).toBe('PacProxyAgent');
+            expect(request.agent.uri.protocol).toBe('file:');
+            expect(request.agent.uri.pathname).toBe('/D:/rsshub/proxy');
+        };
+
+        nock(/rsshub\.test/)
+            .get('/proxy')
+            .times(2)
+            .reply(200, simpleResponse);
+
+        await got.get('http://rsshub.test/proxy');
+        await parser.parseURL('http://rsshub.test/proxy');
+    });
+
+    it('pac-script data', async () => {
+        process.env.PAC_SCRIPT = "function FindProxyForURL(url,host){return 'DIRECT';}";
+
+        jest.resetModules();
+        require('../../lib/utils/request-wrapper');
+
+        check = (request) => {
+            expect(request.agent.constructor.name).toBe('PacProxyAgent');
+            expect(request.agent.uri.protocol).toBe('data:');
+            expect(request.agent.uri.pathname).toBe("text/javascript;charset=utf-8,function%20FindProxyForURL(url%2Chost)%7Breturn%20'DIRECT'%3B%7D");
+        };
+
+        nock(/rsshub\.test/)
+            .get('/proxy')
+            .times(2)
+            .reply(200, simpleResponse);
+
+        await got.get('http://rsshub.test/proxy');
+        await parser.parseURL('http://rsshub.test/proxy');
+    });
+
     it('auth', async () => {
process.env.PROXY_AUTH = 'testtest'; process.env.PROXY_PROTOCOL = 'http'; // only http(s) proxies extract auth from Headers diff --git a/website/docs/install/config.md b/website/docs/install/config.md index 20d524a2dc6d19..85bc93d0650c4f 100644 --- a/website/docs/install/config.md +++ b/website/docs/install/config.md @@ -40,7 +40,7 @@ RSSHub supports two caching methods: memory and redis Partial routes have a strict anti-crawler policy, and can be configured to use proxy. -Proxy can be configured through **Proxy URI**, **Proxy options**, or **Reverse proxy**. +Proxy can be configured through **Proxy URI**, **Proxy options**, **PAC script**, or **Reverse proxy**. ### Proxy URI @@ -96,6 +96,20 @@ async function handleRequest(request) { } ``` +### PAC script + +:::warning + +This proxy method overwrites `PROXY_URI`, `PROXY_PROTOCOL`, `PROXY_HOST` and `PROXY_PORT`. + +::: + +About PAC script, please refer to [Proxy Auto-Configuration (PAC) file](https://developer.mozilla.org/docs/Web/HTTP/Proxy_servers_and_tunneling/Proxy_Auto-Configuration_PAC_file). + +`PAC_URI`: PAC script URL, supports http, https, ftp, file, data. See [pac-proxy-agent](https://www.npmjs.com/package/pac-proxy-agent) NPM package page. + +`PAC_SCRIPT`: Hard-coded JavaScript code string of PAC script. Overwrites `PAC_URI`. + ### Proxy options `PROXY_PROTOCOL`: Using proxy, supports socks, http, https, etc. See [socks-proxy-agent](https://www.npmjs.com/package/socks-proxy-agent) NPM package page and [source](https://github.com/TooTallNate/proxy-agents/blob/63adbcefdb4783cc67c0eb90200886b4064e8639/packages/socks-proxy-agent/src/index.ts#L81) for what these protocols mean. See also [cURL OOTW: SOCKS5](https://daniel.haxx.se/blog/2020/05/26/curl-ootw-socks5/) for reference. 
diff --git a/website/i18n/zh/docusaurus-plugin-content-docs/current/install/config.md b/website/i18n/zh/docusaurus-plugin-content-docs/current/install/config.md index 531b2d22b8685c..0145b4105ae6fa 100644 --- a/website/i18n/zh/docusaurus-plugin-content-docs/current/install/config.md +++ b/website/i18n/zh/docusaurus-plugin-content-docs/current/install/config.md @@ -40,7 +40,7 @@ RSSHub 支持 `memory` 和 `redis` 两种缓存方式 部分路由反爬严格,可以配置使用代理抓取。 -可通过**代理 URI **或**代理选项**或**反向代理**三种方式来配置代理。 +可通过**代理 URI** 或**代理选项**或**代理自动配置文件 (PAC)** 或**反向代理**等方式来配置代理。 ### 代理 URI @@ -72,6 +72,20 @@ RSSHub 支持 `memory` 和 `redis` 两种缓存方式 `PROXY_URL_REGEX`: 启用代理的 URL 正则表达式,默认全部开启 `.*` +### 代理自动配置文件 (PAC) + +:::warning + +该方法会覆盖 `PROXY_URI`, `PROXY_PROTOCOL`, `PROXY_HOST` 以及 `PROXY_PORT`。 + +::: + +关于代理自动配置文件 (PAC),请查看[代理自动配置文件(PAC)文件](https://developer.mozilla.org/docs/Web/HTTP/Proxy_servers_and_tunneling/Proxy_Auto-Configuration_PAC_file)。 + +`PAC_URI`: PAC 文件 URI,支持 http, https, ftp, file, data。具体以 [pac-proxy-agent](https://www.npmjs.com/package/pac-proxy-agent) NPM 包的支持为准。 + +`PAC_SCRIPT`: 硬编码的 PAC 脚本字符串。覆盖 `PAC_URI`。 + ### 反向代理 :::warning