From ab2f9be0f8e70ab5c7f7784a72e269ded83cb2f0 Mon Sep 17 00:00:00 2001 From: gentlementlegen Date: Tue, 19 Nov 2024 20:16:54 +0900 Subject: [PATCH] fix(parser): remove footnotes and improve URL extraction Added regex to remove footnotes in data-purge-module. --- src/parser/data-purge-module.ts | 2 ++ src/parser/formatting-evaluator-module.ts | 9 ++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/parser/data-purge-module.ts b/src/parser/data-purge-module.ts index 15a41b3b..81bac909 100644 --- a/src/parser/data-purge-module.ts +++ b/src/parser/data-purge-module.ts @@ -33,6 +33,8 @@ export class DataPurgeModule extends BaseModule { .replace(/^\/.+/g, "") // Remove HTML comments .replace(//g, "") + // Remove the footnotes + .replace(/^###### .*?\[\^\d+\^][\s\S]*$/gm, "") // Keep only one new line needed by markdown-it package to convert to html .replace(/\n\s*\n/g, "\n") .trim(); diff --git a/src/parser/formatting-evaluator-module.ts b/src/parser/formatting-evaluator-module.ts index 72a5ed62..f6c4cea9 100644 --- a/src/parser/formatting-evaluator-module.ts +++ b/src/parser/formatting-evaluator-module.ts @@ -175,11 +175,18 @@ export class FormattingEvaluatorModule extends BaseModule { urlSet.add(url.split("#")[0]); } } else { + const bodyContent = element.textContent; + const urlPattern = /https?:\/\/\S+/g; + const matches = bodyContent?.match(urlPattern); + matches?.map((url) => url.split("#")[0]).forEach((url) => urlSet.add(url)); this._updateTagCount(formatting, tagName, score); } } - console.log(urlSet); + urlSet.forEach(() => { + this._updateTagCount(formatting, "a", this._multipliers[commentType].html["a"].score ?? 0); + }); const words = this._countWordsFromRegex(htmlElement.textContent ?? "", this._multipliers[commentType]?.wordValue); + return { formatting, words }; }