// Mirror of https://github.com/M82-project/DIMA.git
// Synced 2025-10-29 13:06:08 +02:00
// 201 lines · 5.7 KiB · JavaScript
// Content Extractor Module
// Responsible for extracting and cleaning content from web pages

/**
 * Extracts and cleans textual content (title, body text, coarse page type)
 * from the current page by querying the global `document` and `window`.
 */
class ContentExtractor {
  /**
   * @param {object} [settings] - Extraction settings. The object is stored by
   *   reference; keys that are missing (null/undefined) fall back to
   *   `ContentExtractor.DEFAULT_SETTINGS` whenever they are read.
   * @param {number} [settings.maxContentLength=5000] - Upper bound, in
   *   characters, on the text returned by `extractContent`.
   * @param {number} [settings.minKeywordLength=3] - Not read by this class.
   * @param {boolean} [settings.debugMode=false] - Enables `log` output.
   */
  constructor(settings) {
    this.settings = settings || { ...ContentExtractor.DEFAULT_SETTINGS };
  }

  /**
   * Reads one setting, falling back to the built-in default when the caller
   * supplied a partial settings object.
   *
   * @param {string} name - Setting key.
   * @returns {*} The configured value, or the default for that key.
   */
  getSetting(name) {
    return this.settings?.[name] ?? ContentExtractor.DEFAULT_SETTINGS[name];
  }

  /**
   * Writes a debug line to the console when `debugMode` is enabled.
   *
   * @param {string} message - Text appended to the "ContentExtractor: " prefix.
   * @param {*} [data=null] - Optional extra value; falsy values print as "".
   */
  log(message, data = null) {
    if (this.getSetting("debugMode")) {
      console.log(`ContentExtractor: ${message}`, data || "");
    }
  }

  /**
   * Builds a title string from several page sources (document title, Open
   * Graph / Twitter meta tags, first <h1>, first title-like element).
   *
   * Sources are trimmed and identical strings are kept only once, so that the
   * same headline repeated in several places does not use up the 500-character
   * budget.
   *
   * @returns {string} Distinct titles joined by a space, at most 500 chars.
   */
  extractTitle() {
    const titleSources = [
      () => document.title,
      () => document.querySelector('meta[property="og:title"]')?.content,
      () => document.querySelector('meta[name="twitter:title"]')?.content,
      () => document.querySelector("h1")?.textContent,
      () =>
        document.querySelector('.title, .headline, [class*="title"]')
          ?.textContent,
    ];

    const seen = new Set();
    const parts = [];
    for (const readSource of titleSources) {
      const value = readSource()?.trim();
      if (!value || seen.has(value)) continue;
      seen.add(value);
      parts.push(value);
    }

    return parts.join(" ").substring(0, 500).trim();
  }

  /**
   * Extracts the main text of the page in three stages:
   *  1. main-content selectors, stopping once more than 1000 chars are found;
   *  2. generic fallback selectors when fewer than 300 chars were found;
   *  3. the visible text of <body> when fewer than 200 chars were found.
   *
   * @returns {string} Cleaned text, at most `maxContentLength` characters.
   */
  extractContent() {
    this.log("Début extraction de contenu...");

    const maxLength = this.getSetting("maxContentLength");
    const extractedTexts = new Set();
    const extractedElements = [];
    let content = "";

    // Stage 1: main-content selectors, in priority order.
    for (const selector of ContentExtractor.CONTENT_SELECTORS) {
      const elements = document.querySelectorAll(selector);
      if (elements.length === 0) continue;

      this.log(`Contenu trouvé avec: ${selector}`);
      content += this.extractTextFromElements(
        elements,
        extractedTexts,
        undefined,
        extractedElements
      );
      if (content.length > 1000) break;
    }

    // Stage 2: generic selectors when the main content is too short.
    if (content.length < 300) {
      this.log("Contenu insuffisant, utilisation de fallbacks...");

      for (const selector of ContentExtractor.FALLBACK_SELECTORS) {
        const elements = document.querySelectorAll(selector);
        content += this.extractTextFromElements(
          elements,
          extractedTexts,
          30,
          extractedElements
        );
        if (content.length > 1500) break;
      }
    }

    // Stage 3: visible body text. `document.body` can be null (e.g. before
    // the body is parsed); in that case, or when the body text is empty, keep
    // whatever was gathered above instead of discarding it.
    if (content.length < 200) {
      this.log("Dernier recours - texte visible");
      const bodyText = this.cleanText(document.body?.innerText);
      if (bodyText) content = bodyText;
    }

    const finalContent = content.substring(0, maxLength).trim();
    this.log(`Extraction terminée: ${finalContent.length} caractères`);

    return finalContent;
  }

  /**
   * Concatenates the cleaned text of the given elements.
   *
   * An element is ignored when `shouldSkipElement` rejects it, when it lies
   * inside an element whose text was already taken (its text would be a pure
   * duplicate), when its cleaned text is 15 characters or shorter, or when
   * the exact same text was already collected.
   *
   * NOTE: `textContent` also includes the text of nested <script>/<style>
   * nodes; `cleanText` only strips characters, not such fragments.
   *
   * @param {Iterable<Element>|ArrayLike<Element>} elements - Candidates.
   * @param {Set<string>} extractedTexts - Texts already collected; updated in
   *   place.
   * @param {number} [maxElements=100] - Only the first N candidates are read.
   * @param {Element[]} [extractedElements=[]] - Elements whose text was
   *   already taken; updated in place. Share one array across calls to
   *   suppress nested duplicates between selectors.
   * @returns {string} Collected texts, each followed by a single space.
   */
  extractTextFromElements(
    elements,
    extractedTexts,
    maxElements = 100,
    extractedElements = []
  ) {
    const maxLength = this.getSetting("maxContentLength");
    let text = "";

    for (const element of Array.from(elements).slice(0, maxElements)) {
      // Checked first because it is cheaper than the computed-style lookup
      // performed by shouldSkipElement.
      const isNested = extractedElements.some((prior) =>
        prior.contains(element)
      );
      if (isNested || this.shouldSkipElement(element)) continue;

      const elementText = this.cleanText(
        element.textContent || element.innerText
      );
      if (elementText.length <= 15 || extractedTexts.has(elementText)) continue;

      extractedTexts.add(elementText);
      extractedElements.push(element);
      text += `${elementText} `;

      if (text.length > maxLength) break;
    }

    return text;
  }

  /**
   * Decides whether an element is page chrome (navigation, ads, social
   * widgets, hidden nodes, ...) rather than content.
   *
   * The class attribute is read through `getAttribute("class")` when
   * `className` is not a string, which is the case for SVG elements.
   * The two-letter marker "ad" is matched as a whole class word ("ad",
   * "ads", "ad-slot", "adContainer") instead of as a substring, so classes
   * such as "lead", "shadow" or "loading" are no longer rejected.
   *
   * @param {Element} element - Element to test.
   * @returns {boolean} True when the element should be ignored.
   */
  shouldSkipElement(element) {
    const rawClass =
      typeof element.className === "string"
        ? element.className
        : element.getAttribute("class") || "";
    const className = rawClass.toLowerCase();
    const id = typeof element.id === "string" ? element.id.toLowerCase() : "";

    if (
      ContentExtractor.SKIP_CLASS_KEYWORDS.some((skip) =>
        className.includes(skip)
      )
    ) {
      return true;
    }

    // Split camelCase before lowercasing so "adContainer" yields the word "ad".
    const classWords = rawClass
      .replace(/([a-z0-9])([A-Z])/g, "$1 $2")
      .toLowerCase();
    if (ContentExtractor.AD_CLASS_PATTERN.test(classWords)) return true;

    if (ContentExtractor.SKIP_IDS.some((skip) => id.includes(skip))) {
      return true;
    }

    const hasSkipAttribute = ContentExtractor.SKIP_ATTRIBUTES.some(
      ([name, value]) => element.getAttribute(name) === value
    );
    if (hasSkipAttribute) return true;

    if (element.getAttribute("aria-hidden") === "true") return true;

    // Most expensive check, so it runs last.
    return getComputedStyle(element)?.display === "none";
  }

  /**
   * Normalizes raw element text.
   *
   * Steps: Unicode NFC normalization; typographic quotes and apostrophes
   * mapped to their ASCII forms; every character that is not a letter, mark,
   * digit, underscore, whitespace or one of `.,!?;:()-'"%` removed; runs of
   * whitespace collapsed to one space; result trimmed. Letters of any script
   * are kept.
   *
   * @param {string|null|undefined} text - Raw text.
   * @returns {string} Cleaned text, or "" for empty/missing input.
   */
  cleanText(text) {
    if (!text) return "";

    return text
      .normalize("NFC")
      .replace(/[\u2018\u2019\u02BC]/g, "'")
      .replace(/[\u201C\u201D\u00AB\u00BB]/g, '"')
      .replace(/[^\p{L}\p{M}\p{N}_\s.,!?;:()\-'"%]/gu, "")
      // Collapse after stripping: removing a symbol that sat between two
      // spaces would otherwise leave a double space.
      .replace(/\s+/g, " ")
      .trim();
  }

  /**
   * Classifies the page from substrings of the lower-cased current URL.
   * Rules are tried in order (news, blog, social, commerce); the first rule
   * with a matching fragment wins.
   *
   * @returns {"news"|"blog"|"social"|"commerce"|"general"} Page type.
   */
  detectPageType() {
    const url = window.location.href.toLowerCase();

    for (const [pageType, fragments] of ContentExtractor.PAGE_TYPE_RULES) {
      if (fragments.some((fragment) => url.includes(fragment))) {
        return pageType;
      }
    }

    return "general";
  }
}

// Defaults applied to any setting the caller leaves unset.
ContentExtractor.DEFAULT_SETTINGS = Object.freeze({
  maxContentLength: 5000,
  minKeywordLength: 3,
  debugMode: false,
});

// Main-content selectors, in priority order.
ContentExtractor.CONTENT_SELECTORS = Object.freeze([
  "article",
  '[role="main"]',
  "main",
  ".article-content, .post-content, .entry-content",
  ".content, .story-body, .article-body",
  "#article-body, .post-body, .text-content",
]);

// Generic selectors used when the main-content selectors yield too little.
ContentExtractor.FALLBACK_SELECTORS = Object.freeze([
  "p, h1, h2, h3, h4, h5, h6",
  ".text, .description, .summary",
  '[class*="content"], [class*="text"]',
  "blockquote, figcaption",
]);

// Substrings of the lower-cased class attribute that mark page chrome.
// "advert" covers "advertisement"/"advertising"; the bare "ad" marker is
// handled by AD_CLASS_PATTERN.
ContentExtractor.SKIP_CLASS_KEYWORDS = Object.freeze([
  "nav",
  "menu",
  "footer",
  "header",
  "sidebar",
  "advert",
  "adsbygoogle",
  "adslot",
  "adunit",
  "social",
  "share",
  "cookie",
  "popup",
  "modal",
  "overlay",
  "banner",
  "newsletter",
  "related",
  "suggest",
  "recommend",
  "widget",
  "promo",
  "promotion",
  "comment",
  "rating",
  "review",
  "breadcrumb",
  "pagination",
  "tag",
  "metadata",
  "byline",
  "author-bio",
  "subscription",
  "paywall",
]);

// Matches "ad"/"ads" (optionally followed by digits) as a whole class word.
// No `g` flag, so `.test` keeps no state between calls.
ContentExtractor.AD_CLASS_PATTERN = /(?:^|[^a-z0-9])ads?\d*(?:[^a-z0-9]|$)/;

// Substrings of the lower-cased id attribute that mark page chrome.
ContentExtractor.SKIP_IDS = Object.freeze([
  "nav",
  "menu",
  "footer",
  "header",
  "sidebar",
  "comments",
  "cookie-banner",
  "newsletter",
  "popup",
  "modal",
  "overlay",
  "related-articles",
  "advertisement",
  "social-sharing",
]);

// Exact [attribute, value] pairs that mark page chrome.
ContentExtractor.SKIP_ATTRIBUTES = Object.freeze([
  ["data-module", "Advertisement"],
  ["data-component", "SocialShare"],
  ["data-track-component", "Newsletter"],
  ["role", "banner"],
  ["role", "navigation"],
  ["role", "complementary"],
]);

// Ordered [pageType, urlFragments] rules used by detectPageType.
ContentExtractor.PAGE_TYPE_RULES = Object.freeze([
  ["news", ["news", "article", "actualit"]],
  ["blog", ["blog"]],
  ["social", ["facebook", "twitter", "instagram"]],
  ["commerce", ["shop", "buy", "product", "commerce"]],
]);
|
|
|
|
// Expose the class on `window` so it is reachable globally within the Chrome extension
Object.assign(window, { ContentExtractor });