Skip to content

Commit

Permalink
feat(proxy): add PAC script support (#14218)
Browse files Browse the repository at this point in the history
* feat(proxy): add PAC script support

* fix pnpm fail

* fix coverage

* fix coverage final

* update docs
  • Loading branch information
JimenezLi authored Jan 13, 2024
1 parent 697a479 commit 0e70524
Show file tree
Hide file tree
Showing 9 changed files with 315 additions and 4 deletions.
2 changes: 2 additions & 0 deletions lib/config.js
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ const calculateValue = () => {
},
proxyStrategy: envs.PROXY_STRATEGY || 'all', // all / on_retry
reverseProxyUrl: envs.REVERSE_PROXY_URL,
pacUri: envs.PAC_URI,
pacScript: envs.PAC_SCRIPT,
// auth
authentication: {
name: envs.HTTP_BASIC_AUTH_NAME || 'usernam3',
Expand Down
75 changes: 75 additions & 0 deletions lib/utils/pac-proxy.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
const config = require('@/config').value;
const logger = require('./logger');

// URL schemes a PAC location may use, written without the trailing ':'.
const possibleProtocol = ['http', 'https', 'ftp', 'file', 'data'];

/**
 * Resolve the PAC (Proxy Auto-Configuration) settings into the same result shape
 * that unify-proxy produces.
 *
 * A string `pacScript` is wrapped into a `data:` URI and takes precedence over `pacUri`.
 * The resulting URI must be parseable by `new URL()` and use one of `possibleProtocol`;
 * otherwise an error is logged and the PAC configuration is treated as absent.
 *
 * NOTE: `proxyObj` is modified in place and the very same object is returned. Its
 * `protocol`, `host` and `port` are overwritten from the PAC URI (or cleared, together
 * with `auth`, when no usable PAC URI remains).
 *
 * @param {string|null|undefined} pacUri - Location of the PAC file.
 * @param {string|null|undefined} pacScript - Inline PAC script source; non-string truthy values are rejected.
 * @param {object} proxyObj - Proxy options object to update (`protocol`, `host`, `port`, `auth`).
 * @returns {{proxyUri: (string|null), proxyObj: object, proxyUrlHandler: (URL|null)}}
 *     `proxyUri` is the accepted PAC URI exactly as given (or the generated `data:` URI),
 *     `proxyUrlHandler` its parsed form; both are `null` when the PAC settings are unusable.
 */
const pacProxy = (pacUri, pacScript, proxyObj) => {
    // Work on a local so the caller-visible parameter is never reassigned.
    let uri = pacUri;

    // Validate PAC_URI / PAC_SCRIPT
    if (pacScript) {
        if (typeof pacScript === 'string') {
            uri = `data:text/javascript;charset=utf-8,${encodeURIComponent(pacScript)}`;
        } else {
            logger.error('Invalid PAC_SCRIPT, use PAC_URI instead');
        }
    }

    let urlHandler = null;
    if (uri && typeof uri === 'string') {
        try {
            urlHandler = new URL(uri);
        } catch (e) {
            uri = null;
            logger.error(`Parse PAC_URI error: ${e.stack}`);
        }
    } else {
        uri = null;
    }

    // Check if PAC_URI has the right protocol.
    // From here on `uri` is non-null exactly when `urlHandler` is non-null.
    if (urlHandler) {
        const scheme = urlHandler.protocol.replace(':', '');
        if (!possibleProtocol.includes(scheme)) {
            logger.error(`Unsupported PAC protocol: ${scheme}, expect one of ${possibleProtocol.join(', ')}`);
            uri = null;
            urlHandler = null;
        }
    }

    // Validate proxyObj
    if (urlHandler) {
        proxyObj.host = urlHandler.hostname;
        // An empty port (scheme default) parses to NaN and is stored as undefined.
        proxyObj.port = Number.parseInt(urlHandler.port, 10) || undefined;
        proxyObj.protocol = urlHandler.protocol.replace(':', '');
    } else {
        proxyObj.auth = undefined;
        proxyObj.port = undefined;
        proxyObj.host = undefined;
        proxyObj.protocol = undefined;
    }

    // Validate PROXY_AUTH
    if (proxyObj.auth && urlHandler) {
        let promptProxyUri = false;
        if (urlHandler.username || urlHandler.password) {
            // Credentials embedded in the PAC URI win over PROXY_AUTH.
            logger.warn('PAC_URI contains username and/or password, ignoring PROXY_AUTH');
            proxyObj.auth = undefined;
        } else if (!['http:', 'https:'].includes(urlHandler.protocol)) {
            logger.warn(`PROXY_AUTH is only supported by HTTP(S) proxies, but got ${urlHandler.protocol}, ignoring`);
            proxyObj.auth = undefined;
            promptProxyUri = true;
        } else {
            logger.info('PROXY_AUTH is set and will be used for requests from Node.js. However, requests from puppeteer will not use it');
            promptProxyUri = true;
        }
        if (promptProxyUri) {
            logger.info('To get rid of this, set PAC_URI like protocol://username:password@host:port and clear PROXY_{AUTH,PROTOCOL,HOST,PORT}');
        }
    }

    // Compatible with unify-proxy
    return {
        proxyUri: uri,
        proxyObj,
        proxyUrlHandler: urlHandler,
    };
};

// Export the resolver itself plus the result of running it once, at module load time,
// against the configured values (spread in as proxyUri, proxyObj and proxyUrlHandler).
// pacProxy assigns to the object it receives, so config.proxy is updated by this call.
module.exports = {
pacProxy,
...pacProxy(config.pacUri, config.pacScript, config.proxy),
};
8 changes: 6 additions & 2 deletions lib/utils/request-wrapper.js
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
const config = require('@/config').value;
const { proxyUri, proxyObj, proxyUrlHandler } = require('./unify-proxy');
const proxyIsPAC = config.pacUri || config.pacScript;
const { proxyUri, proxyObj, proxyUrlHandler } = proxyIsPAC ? require('./pac-proxy') : require('./unify-proxy');
const logger = require('./logger');
const http = require('http');
const https = require('https');

let agent = null;
if (proxyUri) {
if (proxyIsPAC) {
const { PacProxyAgent } = require('pac-proxy-agent');
agent = new PacProxyAgent(`pac+${proxyUri}`);
} else if (proxyUri) {
if (proxyUri.startsWith('http')) {
const { HttpsProxyAgent } = require('https-proxy-agent');
agent = new HttpsProxyAgent(proxyUri);
Expand Down
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@
"module-alias": "2.2.3",
"notion-to-md": "3.1.1",
"oauth-1.0a": "2.2.6",
"pac-proxy-agent": "7.0.1",
"plist": "3.1.0",
"proxy-chain": "2.4.0",
"puppeteer": "21.7.0",
Expand Down
3 changes: 3 additions & 0 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

82 changes: 82 additions & 0 deletions test/utils/pac-proxy.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
const { pacProxy } = require('../../lib/utils/pac-proxy');

// Baseline proxy options with no proxy configured; spread below to build other fixtures.
// pacProxy writes to the proxy object it is handed, so pass a copy whenever this
// fixture has to stay pristine.
const emptyProxyObj = {
protocol: undefined,
host: undefined,
port: undefined,
auth: undefined,
url_regex: '.*',
};

/**
 * Assert that a pacProxy() result carries the expected PAC URI (identity/primitive
 * comparison via toBe) and the expected proxy options (structural comparison via toEqual).
 */
const effectiveExpect = (result, expectUri, expectObj) => {
    const { proxyUri: actualUri, proxyObj: actualObj } = result;
    expect(actualUri).toBe(expectUri);
    expect(actualObj).toEqual(expectObj);
};

describe('pac-proxy', () => {
    // pacProxy assigns to the proxy object it receives and returns that same object.
    // Handing it a shared fixture would make `expect(proxyObj).toEqual(fixture)` compare
    // the object with itself (an assertion that can never fail) and would leak state
    // between tests, so every call below works on a fresh shallow copy.
    const run = (pacUri, pacScript, proxyObj = emptyProxyObj) => pacProxy(pacUri, pacScript, { ...proxyObj });
    const nullExpect = (pac) => effectiveExpect(pac, null, emptyProxyObj);

    // Expected proxy options; a URI without an explicit port leaves `port` undefined.
    const httpObj = { ...emptyProxyObj, protocol: 'http', host: 'rsshub.proxy' };
    const httpsNoPortObj = { ...emptyProxyObj, protocol: 'https', host: 'rsshub.proxy' };
    const httpsObj = { ...httpsNoPortObj, port: 2333 };
    const ftpObj = { ...httpsObj, protocol: 'ftp' };
    // file: and data: URLs have an empty hostname.
    const fileObj = { ...emptyProxyObj, protocol: 'file', host: '' };
    const dataObj = { ...emptyProxyObj, protocol: 'data', host: '' };

    it('pac empty', () => {
        nullExpect(run(null, null));
    });
    it('pac-uri invalid', () => {
        nullExpect(run('http://inv ild.test', null));
    });
    it('pac-uri invalid protocol', () => {
        nullExpect(run('socks://rsshub.proxy', null));
    });

    const httpUri = 'http://rsshub.proxy/pac.pac';
    it('pac-uri http', () => {
        effectiveExpect(run(httpUri, null), httpUri, httpObj);
    });

    const httpsUri = 'https://rsshub.proxy/pac.pac';
    it('pac-uri https', () => {
        effectiveExpect(run(httpsUri, null), httpsUri, httpsNoPortObj);
    });

    const ftpUri = 'ftp://rsshub.proxy:2333';
    it('pac-uri ftp', () => {
        effectiveExpect(run(ftpUri, null), ftpUri, ftpObj);
    });

    const fileUri = 'file:///path/to/pac.pac';
    it('pac-uri file', () => {
        effectiveExpect(run(fileUri, null), fileUri, fileObj);
    });

    const dataPacScript = "function FindProxyForURL(url, host){return 'DIRECT';}";
    const dataUri = 'data:text/javascript;charset=utf-8,' + encodeURIComponent(dataPacScript);
    it('pac-script data', () => {
        effectiveExpect(run(null, dataPacScript), dataUri, dataObj);
    });
    it('pac-script data invalid type', () => {
        // A non-string PAC_SCRIPT is rejected and PAC_URI is used instead.
        effectiveExpect(run(httpsUri, 1), httpsUri, httpsNoPortObj);
    });

    const httpsAuthUri = 'https://user:pass@rsshub.proxy:2333';
    it('pac-uri https auth', () => {
        effectiveExpect(run(httpsAuthUri, null), httpsAuthUri, httpsObj);
    });

    const httpsAuthObj = { ...httpsObj, auth: 'testtest' };
    it('pac proxy-obj https auth', () => {
        // PROXY_AUTH survives for an HTTP(S) PAC URI without credentials; host/port/protocol
        // are still taken from the PAC URI, which carries no explicit port here.
        effectiveExpect(run(httpsUri, null, httpsAuthObj), httpsUri, { ...httpsNoPortObj, auth: 'testtest' });
    });

    const ftpAuthUri = 'ftp://user:pass@rsshub.proxy:2333';
    it('pac-uri ftp auth', () => {
        effectiveExpect(run(ftpAuthUri, null), ftpAuthUri, ftpObj);
    });

    const ftpAuthObj = { ...ftpObj, auth: 'testtest' };
    it('pac-uri ftp auth (invalid)', () => {
        // PROXY_AUTH is dropped for non-HTTP(S) PAC URIs.
        effectiveExpect(run(ftpUri, null, ftpAuthObj), ftpUri, ftpObj);
    });

    it('pac-uri user@pass override proxy-obj auth', () => {
        effectiveExpect(run(httpsAuthUri, null, httpsAuthObj), httpsAuthUri, httpsObj);
    });
});
116 changes: 116 additions & 0 deletions test/utils/request-wrapper.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@ require('../../lib/utils/request-wrapper');
let check = () => {};
const simpleResponse = '<rss version="2.0"><channel><item></item></channel></rss>';

// Clear the PAC-related environment variables before every test, so a value set by
// one of the pac-* tests is not still present when the next test starts.
beforeEach(() => {
delete process.env.PAC_URI;
delete process.env.PAC_SCRIPT;
});

afterEach(() => {
delete process.env.PROXY_URI;
delete process.env.PROXY_PROTOCOL;
Expand Down Expand Up @@ -215,6 +220,117 @@ describe('got', () => {
await parser.parseURL(url);
});

// With PAC_URI pointing at an http location, requests should carry a PacProxyAgent
// whose `uri` reflects that location.
it('pac-uri http', async () => {
process.env.PAC_URI = 'http://rsshub.proxy:2333';

// Reset the module registry so the require below evaluates request-wrapper afresh
// (presumably so it re-reads PAC_URI through config; confirm against lib/config.js).
jest.resetModules();
require('../../lib/utils/request-wrapper');

// NOTE(review): `check` is declared outside this test; it appears to be called with each
// outgoing request by a hook that is not visible in this hunk.
check = (request) => {
expect(request.agent.constructor.name).toBe('PacProxyAgent');
expect(request.agent.uri.protocol).toBe('http:');
expect(request.agent.uri.host).toBe('rsshub.proxy:2333');
expect(request.agent.uri.hostname).toBe('rsshub.proxy');
expect(request.agent.uri.port).toBe('2333');
};

// Two mocked replies: one for the got call and one for the parser call below.
nock(/rsshub\.test/)
.get('/proxy')
.times(2)
.reply(200, simpleResponse);

await got.get('http://rsshub.test/proxy');
await parser.parseURL('http://rsshub.test/proxy');
});

// Same scenario as the http case, with the PAC file served over https.
it('pac-uri https', async () => {
process.env.PAC_URI = 'https://rsshub.proxy:2333';

// Reset the module registry so the require below evaluates request-wrapper afresh.
jest.resetModules();
require('../../lib/utils/request-wrapper');

// NOTE(review): `check` is declared outside this test; it appears to be called with each
// outgoing request by a hook that is not visible in this hunk.
check = (request) => {
expect(request.agent.constructor.name).toBe('PacProxyAgent');
expect(request.agent.uri.protocol).toBe('https:');
expect(request.agent.uri.host).toBe('rsshub.proxy:2333');
expect(request.agent.uri.hostname).toBe('rsshub.proxy');
expect(request.agent.uri.port).toBe('2333');
};

// Two mocked replies: one for the got call and one for the parser call below.
nock(/rsshub\.test/)
.get('/proxy')
.times(2)
.reply(200, simpleResponse);

await got.get('http://rsshub.test/proxy');
await parser.parseURL('http://rsshub.test/proxy');
});

// Same scenario as the http case, with the PAC file located via an ftp URI.
it('pac-uri ftp', async () => {
process.env.PAC_URI = 'ftp://rsshub.proxy:2333';

// Reset the module registry so the require below evaluates request-wrapper afresh.
jest.resetModules();
require('../../lib/utils/request-wrapper');

// NOTE(review): `check` is declared outside this test; it appears to be called with each
// outgoing request by a hook that is not visible in this hunk.
check = (request) => {
expect(request.agent.constructor.name).toBe('PacProxyAgent');
expect(request.agent.uri.protocol).toBe('ftp:');
expect(request.agent.uri.host).toBe('rsshub.proxy:2333');
expect(request.agent.uri.hostname).toBe('rsshub.proxy');
expect(request.agent.uri.port).toBe('2333');
};

// Two mocked replies: one for the got call and one for the parser call below.
nock(/rsshub\.test/)
.get('/proxy')
.times(2)
.reply(200, simpleResponse);

await got.get('http://rsshub.test/proxy');
await parser.parseURL('http://rsshub.test/proxy');
});

// PAC file on the local file system: there is no host or port to check, so only the
// protocol and the path of the agent's uri are asserted.
it('pac-uri file', async () => {
process.env.PAC_URI = 'file:///D:/rsshub/proxy';

// Reset the module registry so the require below evaluates request-wrapper afresh.
jest.resetModules();
require('../../lib/utils/request-wrapper');

// NOTE(review): `check` is declared outside this test; it appears to be called with each
// outgoing request by a hook that is not visible in this hunk.
check = (request) => {
expect(request.agent.constructor.name).toBe('PacProxyAgent');
expect(request.agent.uri.protocol).toBe('file:');
expect(request.agent.uri.pathname).toBe('/D:/rsshub/proxy');
};

// Two mocked replies: one for the got call and one for the parser call below.
nock(/rsshub\.test/)
.get('/proxy')
.times(2)
.reply(200, simpleResponse);

await got.get('http://rsshub.test/proxy');
await parser.parseURL('http://rsshub.test/proxy');
});

// Inline PAC_SCRIPT: pac-proxy wraps the script into a percent-encoded data: URI, which is
// what the agent's uri is expected to hold.
it('pac-script data', async () => {
process.env.PAC_SCRIPT = "function FindProxyForURL(url,host){return 'DIRECT';}";

// Reset the module registry so the require below evaluates request-wrapper afresh.
jest.resetModules();
require('../../lib/utils/request-wrapper');

// NOTE(review): `check` is declared outside this test; it appears to be called with each
// outgoing request by a hook that is not visible in this hunk.
check = (request) => {
expect(request.agent.constructor.name).toBe('PacProxyAgent');
expect(request.agent.uri.protocol).toBe('data:');
expect(request.agent.uri.pathname).toBe("text/javascript;charset=utf-8,function%20FindProxyForURL(url%2Chost)%7Breturn%20'DIRECT'%3B%7D");
};

// Two mocked replies: one for the got call and one for the parser call below.
nock(/rsshub\.test/)
.get('/proxy')
.times(2)
.reply(200, simpleResponse);

await got.get('http://rsshub.test/proxy');
await parser.parseURL('http://rsshub.test/proxy');
});

it('auth', async () => {
process.env.PROXY_AUTH = 'testtest';
process.env.PROXY_PROTOCOL = 'http'; // only http(s) proxies extract auth from Headers
Expand Down
16 changes: 15 additions & 1 deletion website/docs/install/config.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ RSSHub supports two caching methods: memory and redis

Partial routes have a strict anti-crawler policy, and can be configured to use proxy.

Proxy can be configured through **Proxy URI**, **Proxy options**, or **Reverse proxy**.
Proxy can be configured through **Proxy URI**, **Proxy options**, **PAC script**, or **Reverse proxy**.

### Proxy URI

Expand Down Expand Up @@ -96,6 +96,20 @@ async function handleRequest(request) {
}
```

### PAC script

:::warning

This proxy method overwrites `PROXY_URI`, `PROXY_PROTOCOL`, `PROXY_HOST` and `PROXY_PORT`.

:::

For details about PAC scripts, see [Proxy Auto-Configuration (PAC) file](https://developer.mozilla.org/docs/Web/HTTP/Proxy_servers_and_tunneling/Proxy_Auto-Configuration_PAC_file).

`PAC_URI`: PAC script URL, supports http, https, ftp, file, data. See [pac-proxy-agent](https://www.npmjs.com/package/pac-proxy-agent) NPM package page.

`PAC_SCRIPT`: The PAC script itself, given as a JavaScript code string. Takes precedence over `PAC_URI` when both are set.

### Proxy options

`PROXY_PROTOCOL`: Using proxy, supports socks, http, https, etc. See [socks-proxy-agent](https://www.npmjs.com/package/socks-proxy-agent) NPM package page and [source](https://github.com/TooTallNate/proxy-agents/blob/63adbcefdb4783cc67c0eb90200886b4064e8639/packages/socks-proxy-agent/src/index.ts#L81) for what these protocols mean. See also [cURL OOTW: SOCKS5](https://daniel.haxx.se/blog/2020/05/26/curl-ootw-socks5/) for reference.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ RSSHub 支持 `memory` 和 `redis` 两种缓存方式

部分路由反爬严格,可以配置使用代理抓取。

可通过**代理 URI**、**代理选项**、**反向代理**三种方式来配置代理
可通过**代理 URI**、**代理选项**、**代理自动配置文件 (PAC)**、**反向代理**等方式来配置代理

### 代理 URI

Expand Down Expand Up @@ -72,6 +72,20 @@ RSSHub 支持 `memory` 和 `redis` 两种缓存方式

`PROXY_URL_REGEX`: 启用代理的 URL 正则表达式,默认全部开启 `.*`

### 代理自动配置文件 (PAC)

:::warning

该方法会覆盖 `PROXY_URI`, `PROXY_PROTOCOL`, `PROXY_HOST` 以及 `PROXY_PORT`

:::

关于代理自动配置文件 (PAC),请查看[代理自动配置文件(PAC)](https://developer.mozilla.org/docs/Web/HTTP/Proxy_servers_and_tunneling/Proxy_Auto-Configuration_PAC_file)

`PAC_URI`: PAC 文件 URI,支持 http, https, ftp, file, data。具体以 [pac-proxy-agent](https://www.npmjs.com/package/pac-proxy-agent) NPM 包的支持为准。

`PAC_SCRIPT`: 硬编码的 PAC 脚本字符串。覆盖 `PAC_URI`

### 反向代理

:::warning
Expand Down

0 comments on commit 0e70524

Please sign in to comment.