amélioration extractor (suppression du bruit)

Этот коммит содержится в:
BartM82 2025-07-28 13:59:43 +00:00
родитель ad32c2b63f
Коммит 769df8eb5d
3 изменённых файлов: 29 добавлений и 9 удалений

Просмотреть файл

@ -176,8 +176,7 @@ const DIMA_TECHNIQUES = [
"same as",
"cela rappelle",
"déjà vu",
"par ailleurs",
"contexte",
"dans ce contexte",
],
weight: 0.9,
type: "technique",
@ -493,11 +492,6 @@ const DIMA_TECHNIQUES = [
"stéréotype",
"les étrangers",
"les immigrants",
"all the",
"toujours",
"always",
"jamais",
"never",
"en général",
"in general",
"les français",
@ -853,7 +847,6 @@ const DIMA_TECHNIQUES = [
"confident",
"sûr",
"sure",
"certain",
"capable",
"expert",
"maîtrise",

Просмотреть файл

@ -130,8 +130,21 @@ class ContentExtractor {
"advertisement",
"social",
"share",
"cookie", "popup", "modal", "overlay", "banner", "newsletter",
"related", "suggest", "recommend", "widget", "promo", "promotion",
"comment", "rating", "review", "breadcrumb", "pagination", "tag",
"metadata", "byline", "author-bio", "subscription", "paywall"
];
const skipIds = ["nav", "menu", "footer", "header", "sidebar", "comments","cookie-banner", "newsletter", "popup", "modal", "overlay",
"related-articles", "advertisement", "social-sharing"];
const skipAttributes = [
'data-module="Advertisement"',
'data-component="SocialShare"',
'data-track-component="Newsletter"',
'role="banner"',
'role="navigation"',
'role="complementary"'
];
const skipIds = ["nav", "menu", "footer", "header", "sidebar", "comments"];
const className = element.className?.toLowerCase() || "";
const id = element.id?.toLowerCase() || "";
@ -139,7 +152,10 @@ class ContentExtractor {
return (
skipClasses.some((skip) => className.includes(skip)) ||
skipIds.some((skip) => id.includes(skip)) ||
skipAttributes.some((attr) => element.getAttribute(attr.split('=')[0]) === attr.split('=')[1]?.replace(/"/g, '')) ||
element.getAttribute("aria-hidden") === "true" ||
element.getAttribute("role") === "banner" ||
element.getAttribute("role") === "navigation" ||
getComputedStyle(element).display === "none"
);
}

Просмотреть файл

@ -378,6 +378,17 @@ class TechniqueAnalyzer {
return string.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}
calculatePositionWeight(position, textLength) {
const relativePosition = position / textLength;
// Boost pour les éléments en début de texte (titres, accroches)
if (relativePosition < 0.15) return 1.4;
// Boost modéré pour le premier tiers
if (relativePosition < 0.33) return 1.1;
// Boost pour la fin (conclusions, appels à l'action)
if (relativePosition > 0.85) return 1.2;
// Poids normal pour le milieu
return 1.0;
}
calculateRiskLevel(score) {
if (score < 15) return "Faible";
if (score < 30) return "Modéré";