Skip to content

Commit

Permalink
fix(parser): remove footnotes and improve URL extraction
Browse files: browse the repository at this point in the history
Added regex to remove footnotes in data-purge-module.
  • Loading branch information
gentlementlegen committed Nov 19, 2024
1 parent ab0b831 commit ab2f9be
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 1 deletion.
2 changes: 2 additions & 0 deletions src/parser/data-purge-module.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ export class DataPurgeModule extends BaseModule {
.replace(/^\/.+/g, "")
// Remove HTML comments
.replace(/<!--[\s\S]*?-->/g, "")
// Remove the footnotes
.replace(/^###### .*?\[\^\d+\^][\s\S]*$/gm, "")
// Collapse blank lines into a single newline, which the markdown-it package needs to convert to HTML
.replace(/\n\s*\n/g, "\n")
.trim();
Expand Down
9 changes: 8 additions & 1 deletion src/parser/formatting-evaluator-module.ts
Original file line number Diff line number Diff line change
Expand Up @@ -175,11 +175,18 @@ export class FormattingEvaluatorModule extends BaseModule {
urlSet.add(url.split("#")[0]);
}
} else {
const bodyContent = element.textContent;
const urlPattern = /https?:\/\/\S+/g;
const matches = bodyContent?.match(urlPattern);
matches?.map((url) => url.split("#")[0]).forEach((url) => urlSet.add(url));
this._updateTagCount(formatting, tagName, score);
}
}
console.log(urlSet);
urlSet.forEach(() => {
this._updateTagCount(formatting, "a", this._multipliers[commentType].html["a"].score ?? 0);
});
const words = this._countWordsFromRegex(htmlElement.textContent ?? "", this._multipliers[commentType]?.wordValue);

return { formatting, words };
}

Expand Down

0 comments on commit ab2f9be

Please sign in to comment.