diff --git a/package.json b/package.json index 3384ba4..c785a73 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "warcio", - "version": "2.4.0", + "version": "2.4.1", "keywords": [ "WARC", "web archiving" diff --git a/src/lib/statusandheaders.ts b/src/lib/statusandheaders.ts index 14122e8..1a46346 100644 --- a/src/lib/statusandheaders.ts +++ b/src/lib/statusandheaders.ts @@ -1,4 +1,4 @@ -import { concatChunks, HeadersMultiMap, splitChunk } from "./utils"; +import { concatChunks, HeadersMultiMap, latin1ToUTF, splitChunk, UTFToLatin1 } from "./utils"; import { type AsyncIterReader } from "./readers"; export const CRLF = new Uint8Array([13, 10]); @@ -220,24 +220,6 @@ function splitRemainder(str: string, sep: string, limit: number) { return newParts; } -// =========================================================================== -function UTFToLatin1(value: string) { - const buf = new TextEncoder().encode(value); - - let str = ""; - buf.forEach((x) => (str += String.fromCharCode(x))); - return str; -} - -// =========================================================================== -function latin1ToUTF(str: string) { - const buf = new Uint8Array(str.length); - for (let i = 0; i < str.length; i++) { - buf[i] = str.charCodeAt(i) & 0xff; - } - return new TextDecoder().decode(buf); -} - // =========================================================================== export async function indexOfDoubleCRLF( buffer: Uint8Array, diff --git a/src/lib/utils.ts b/src/lib/utils.ts index 51a995e..9bfe9e7 100644 --- a/src/lib/utils.ts +++ b/src/lib/utils.ts @@ -289,6 +289,24 @@ export function splitChunk( return [chunk.slice(0, inx), chunk.slice(inx)]; } +// =========================================================================== +export function UTFToLatin1(value: string) { + const buf = new TextEncoder().encode(value); + + let str = ""; + buf.forEach((x) => (str += String.fromCharCode(x))); + return str; +} + +// =========================================================================== +export function latin1ToUTF(str: string) { + const buf = new Uint8Array(str.length); + for (let i = 0; i < str.length; i++) { + buf[i] = str.charCodeAt(i) & 0xff; + } + return new TextDecoder().decode(buf); +} + // =========================================================================== // headers multi map const MULTI_VALUE_ALLOWED = ["set-cookie", "warc-concurrent-to", "warc-protocol"]; @@ -297,6 +315,13 @@ const MULTI_VALUE_ALLOWED = ["set-cookie", "warc-concurrent-to", "warc-protocol" // in theory, collision still possible with arbitrary cookie value const JOIN_MARKER = ",,,"; +export function multiValueHeader(name: string, value: string[]) { + if (!MULTI_VALUE_ALLOWED.includes(name.toLowerCase())) { + throw new Error("not a valid multi value header"); + } + return value.join(JOIN_MARKER); +} + export class HeadersMultiMap extends Map { constructor(headersInit?: HeadersInit) { // if an array of array, parse that and add individually here diff --git a/test/testSerializer.test.ts b/test/testSerializer.test.ts index d085a3b..43c12ab 100644 --- a/test/testSerializer.test.ts +++ b/test/testSerializer.test.ts @@ -5,6 +5,7 @@ import { WARCSerializer as BaseWARCSerializer, } from "../src/lib"; import { WARCSerializer } from "../src/node/warcserializer"; +import { multiValueHeader } from "../src/lib/utils"; const decoder = new TextDecoder("utf-8"); const encoder = new TextEncoder(); @@ -429,12 +430,13 @@ set-cookie: greeting=hello, name=world\r\n\ } }); - test("create request record with cookie array, keep headers case", async () => { - const url = "http://example.com/"; + test("create request record with protocol + cookie array, keep headers case", async () => { + const url = "https://example.com/"; const date = "2000-01-01T00:00:00Z"; const type = "request"; const warcHeaders = { "WARC-Record-ID": "", + "WARC-Protocol": multiValueHeader("WARC-Protocol", ["h2", "tls/1.0"]) }; const httpHeaders: [string, string][] = [ ["Set-Cookie", "greeting=hello"], @@ -463,7 +465,9 @@ set-cookie: greeting=hello, name=world\r\n\ "\ WARC/1.0\r\n\ WARC-Record-ID: \r\n\ -WARC-Target-URI: http://example.com/\r\n\ +WARC-Protocol: h2\r\n\ +WARC-Protocol: tls/1.0\r\n\ +WARC-Target-URI: https://example.com/\r\n\ WARC-Date: 2000-01-01T00:00:00Z\r\n\ WARC-Type: request\r\n\ Content-Type: application/http; msgtype=request\r\n\ @@ -484,7 +488,9 @@ Set-Cookie: name=world\r\n\ "\ WARC/1.0\r\n\ WARC-Record-ID: \r\n\ -WARC-Target-URI: http://example.com/\r\n\ +WARC-Protocol: h2\r\n\ +WARC-Protocol: tls/1.0\r\n\ +WARC-Target-URI: https://example.com/\r\n\ WARC-Date: 2000-01-01T00:00:00Z\r\n\ WARC-Type: request\r\n\ Content-Type: application/http; msgtype=request\r\n\