зеркало из
				https://github.com/M82-project/DIMA.git
				synced 2025-10-30 21:46:07 +02:00 
			
		
		
		
	
		
			
				
	
	
		
			201 строка
		
	
	
		
			5.7 KiB
		
	
	
	
		
			JavaScript
		
	
	
	
	
	
			
		
		
	
	
			201 строка
		
	
	
		
			5.7 KiB
		
	
	
	
		
			JavaScript
		
	
	
	
	
	
| // Content Extractor Module
 | |
| // Responsible for extracting and cleaning content from web pages
 | |
| 
 | |
/**
 * Extracts and cleans the readable text of the current web page.
 *
 * All DOM access goes through the global `document`, `window` and
 * `getComputedStyle`, so the class is meant to run inside a page context.
 */
class ContentExtractor {
  /**
   * @param {object} [settings] - Optional overrides for the defaults below.
   *   Missing keys fall back to their default, so a partial object such as
   *   `{ debugMode: true }` still gets a numeric `maxContentLength`.
   * @param {number} [settings.maxContentLength=5000] - Upper bound, in
   *   characters, of the text returned by `extractContent`.
   * @param {number} [settings.minKeywordLength=3] - Stored on `this.settings`;
   *   not read by this class.
   * @param {boolean} [settings.debugMode=false] - Enables `log` output.
   */
  constructor(settings) {
    // Merge instead of replace: the caller's object is copied, never mutated.
    this.settings = {
      maxContentLength: 5000,
      minKeywordLength: 3,
      debugMode: false,
      ...settings,
    };
  }

  /**
   * Writes a prefixed message to the console when `debugMode` is on.
   *
   * @param {string} message - Text appended after the "ContentExtractor: " prefix.
   * @param {*} [data=null] - Optional payload logged as a second argument.
   */
  log(message, data = null) {
    if (this.settings.debugMode) {
      // `??` (not `||`) so that payloads like 0 or false are still printed.
      console.log(`ContentExtractor: ${message}`, data ?? "");
    }
  }

  /**
   * Collects every available title candidate (document title, Open Graph and
   * Twitter meta tags, first <h1>, first title-like element) and joins them
   * with spaces.
   *
   * @returns {string} The joined candidates, capped at 500 characters and trimmed.
   */
  extractTitle() {
    const titleSources = [
      () => document.title,
      () => document.querySelector('meta[property="og:title"]')?.content,
      () => document.querySelector('meta[name="twitter:title"]')?.content,
      () => document.querySelector("h1")?.textContent?.trim(),
      () =>
        document
          .querySelector('.title, .headline, [class*="title"]')
          ?.textContent?.trim(),
    ];

    return titleSources
      .map((readTitle) => readTitle())
      .filter(Boolean)
      .join(" ")
      .substring(0, 500)
      .trim();
  }

  /**
   * Extracts the main text of the page in three stages:
   *   1. priority selectors for the main content (stops once more than 1000
   *      characters were gathered);
   *   2. generic fallback selectors when fewer than 300 characters were found
   *      (at most 30 elements per selector, stops above 1500 characters);
   *   3. the visible text of <body> when fewer than 200 characters remain.
   *
   * @returns {string} Cleaned text, at most `settings.maxContentLength` characters.
   */
  extractContent() {
    this.log("Début extraction de contenu...");

    // Shared across all stages so the same text is never appended twice.
    const extractedTexts = new Set();
    let content = "";

    // Priority selectors for the main content.
    const contentSelectors = [
      "article",
      '[role="main"]',
      "main",
      ".article-content, .post-content, .entry-content",
      ".content, .story-body, .article-body",
      "#article-body, .post-body, .text-content",
    ];

    // Main content extraction.
    for (const selector of contentSelectors) {
      const elements = document.querySelectorAll(selector);
      if (elements.length > 0) {
        this.log(`Contenu trouvé avec: ${selector}`);
        content += this.extractTextFromElements(elements, extractedTexts);
        if (content.length > 1000) break;
      }
    }

    // Fallback when too little content was found.
    if (content.length < 300) {
      this.log("Contenu insuffisant, utilisation de fallbacks...");
      const fallbackSelectors = [
        "p, h1, h2, h3, h4, h5, h6",
        ".text, .description, .summary",
        '[class*="content"], [class*="text"]',
        "blockquote, figcaption",
      ];

      for (const selector of fallbackSelectors) {
        const elements = document.querySelectorAll(selector);
        content += this.extractTextFromElements(elements, extractedTexts, 30);
        if (content.length > 1500) break;
      }
    }

    // Last resort: whatever text the body exposes.
    if (content.length < 200) {
      this.log("Dernier recours - texte visible");
      // `document.body` can be null (e.g. before the body is parsed);
      // cleanText turns the resulting undefined into "".
      const bodyText = this.cleanText(document.body?.innerText);
      content = bodyText.substring(0, this.settings.maxContentLength);
    }

    const finalContent = content
      .substring(0, this.settings.maxContentLength)
      .trim();
    this.log(`Extraction terminée: ${finalContent.length} caractères`);

    return finalContent;
  }

  /**
   * Concatenates the cleaned text of the given elements.
   *
   * Elements rejected by `shouldSkipElement`, texts of 15 characters or less
   * and texts already present in `extractedTexts` are ignored.
   *
   * @param {Iterable<Element>} elements - Candidate elements (e.g. a NodeList).
   * @param {Set<string>} extractedTexts - Texts already emitted; updated in place.
   * @param {number} [maxElements=100] - Only the first `maxElements` elements are read.
   * @returns {string} Accepted texts, each followed by a single space.
   */
  extractTextFromElements(elements, extractedTexts, maxElements = 100) {
    let text = "";
    const elementsArray = Array.from(elements).slice(0, maxElements);

    for (const element of elementsArray) {
      if (this.shouldSkipElement(element)) continue;

      const elementText = this.cleanText(
        element.textContent || element.innerText
      );
      if (
        elementText &&
        elementText.length > 15 &&
        !extractedTexts.has(elementText)
      ) {
        extractedTexts.add(elementText);
        text += elementText + " ";

        // No point reading further once the global cap is exceeded.
        if (text.length > this.settings.maxContentLength) break;
      }
    }

    return text;
  }

  /**
   * Tells whether an element looks like page chrome (navigation, ads,
   * overlays, ...) or is hidden, and should therefore not contribute text.
   *
   * @param {Element} element - Element to inspect.
   * @returns {boolean} True when the element must be ignored.
   */
  shouldSkipElement(element) {
    // Matched as case-insensitive substrings of the class attribute.
    const skipClasses = [
      "nav",
      "menu",
      "footer",
      "header",
      "sidebar",
      "advertisement",
      "social",
      "share",
      "cookie",
      "popup",
      "modal",
      "overlay",
      "banner",
      "newsletter",
      "related",
      "suggest",
      "recommend",
      "widget",
      "promo",
      "promotion",
      "comment",
      "rating",
      "review",
      "breadcrumb",
      "pagination",
      "tag",
      "metadata",
      "byline",
      "author-bio",
      "subscription",
      "paywall",
    ];
    // "ad" is too short for substring matching (it would hit "lead",
    // "heading", "loaded", "thread", ...), so it only counts as a standalone
    // "ad"/"ads" token delimited by non-letters, e.g. "ad-slot" or "top ads".
    const standaloneAdPattern = /(^|[^a-z])ads?([^a-z]|$)/;

    // Matched as case-insensitive substrings of the id.
    const skipIds = [
      "nav",
      "menu",
      "footer",
      "header",
      "sidebar",
      "comments",
      "cookie-banner",
      "newsletter",
      "popup",
      "modal",
      "overlay",
      "related-articles",
      "advertisement",
      "social-sharing",
    ];

    // Exact [attribute, value] pairs that mark an element as non-content.
    const skipAttributes = [
      ["data-module", "Advertisement"],
      ["data-component", "SocialShare"],
      ["data-track-component", "Newsletter"],
      ["role", "banner"],
      ["role", "navigation"],
      ["role", "complementary"],
      ["aria-hidden", "true"],
    ];

    // On SVG elements `className` is an SVGAnimatedString, not a string, so
    // calling toLowerCase() on it would throw; read the attribute instead.
    const rawClass =
      typeof element.className === "string"
        ? element.className
        : (element.getAttribute("class") ?? "");
    const className = rawClass.toLowerCase();
    const id = typeof element.id === "string" ? element.id.toLowerCase() : "";

    return (
      skipClasses.some((skip) => className.includes(skip)) ||
      standaloneAdPattern.test(className) ||
      skipIds.some((skip) => id.includes(skip)) ||
      skipAttributes.some(
        ([name, value]) => element.getAttribute(name) === value
      ) ||
      getComputedStyle(element).display === "none"
    );
  }

  /**
   * Normalizes raw DOM text: straightens typographic quotes, removes every
   * character that is not a word character, whitespace, basic punctuation or
   * a French accented letter/ligature, then collapses whitespace.
   *
   * @param {string|null|undefined} text - Raw text.
   * @returns {string} Cleaned, trimmed text ("" for empty or missing input).
   */
  cleanText(text) {
    if (!text) return "";

    return (
      String(text)
        // Typographic quotes are mapped to their ASCII form first; otherwise
        // the filter below would delete them ("l’État" -> "lÉtat").
        .replace(/[\u2018\u2019]/g, "'")
        .replace(/[\u201C\u201D]/g, '"')
        // The `i` flag makes the accented letters match in both cases.
        .replace(/[^\w\s\.,!?;:()\-'"%àâäéèêëïîôöùûüÿçœæ]/gi, "")
        // Collapse whitespace last so removed characters leave no double spaces.
        .replace(/\s+/g, " ")
        .trim()
    );
  }

  /**
   * Guesses the kind of page from substrings of the current URL.
   * Checks run in order (news, blog, social, commerce); the first hit wins.
   *
   * @returns {"news"|"blog"|"social"|"commerce"|"general"} Detected page type.
   */
  detectPageType() {
    const url = window.location.href.toLowerCase();
    const urlHasAny = (needles) =>
      needles.some((needle) => url.includes(needle));

    if (urlHasAny(["news", "article", "actualit"])) return "news";
    if (urlHasAny(["blog"])) return "blog";
    if (urlHasAny(["facebook", "twitter", "instagram"])) return "social";
    if (urlHasAny(["shop", "buy", "product", "commerce"])) return "commerce";
    return "general";
  }
}
 | |
| 
 | |
| // Expose ContentExtractor as a property of window so the rest of the Chrome extension can reach it as a global
 | |
| window.ContentExtractor = ContentExtractor;
 | 
