зеркало из
https://github.com/M82-project/DIMA.git
synced 2025-10-29 13:06:08 +02:00
Relocates the plugin files from the "fichiers à télécharger" directory to a "releases" directory. This change ensures a cleaner separation between development files and release artifacts. Updates the CI workflow to reflect the new location for zipping and committing the plugin archive.
185 строки
4.9 KiB
JavaScript
185 строки
4.9 KiB
JavaScript
// Content Extractor Module
|
|
// Responsible for extracting and cleaning content from web pages
|
|
|
|
/**
 * Extracts and cleans the readable text of the current web page.
 *
 * All DOM access goes through the global `document`, `window` and
 * `getComputedStyle`, so the class is meant to run inside a page context.
 */
class ContentExtractor {
  /**
   * Settings used when the caller does not provide any. A fresh object is
   * returned on every access so instances never share mutable state.
   *
   * @returns {{maxContentLength: number, minKeywordLength: number, debugMode: boolean}}
   */
  static get DEFAULT_SETTINGS() {
    return {
      maxContentLength: 5000,
      minKeywordLength: 3,
      debugMode: false,
    };
  }

  /**
   * @param {object} [settings] - Extraction settings. The object is stored by
   *   reference (not copied), so later changes made by the caller stay visible.
   * @param {number} [settings.maxContentLength] - Upper bound, in characters,
   *   of the extracted content.
   * @param {number} [settings.minKeywordLength] - Stored but not read by this class.
   * @param {boolean} [settings.debugMode] - Enables console logging.
   */
  constructor(settings) {
    this.settings = settings || ContentExtractor.DEFAULT_SETTINGS;
  }

  /**
   * Returns the effective content length limit.
   *
   * A partial settings object (for example `{ debugMode: true }`) has no
   * `maxContentLength`; without this fallback every `substring`/length check
   * would receive `undefined` and the cap would silently disappear.
   *
   * @returns {number} The configured limit, or the default when it is missing
   *   or not a non-negative number.
   */
  getMaxContentLength() {
    const limit = this.settings?.maxContentLength;
    return typeof limit === "number" && limit >= 0
      ? limit
      : ContentExtractor.DEFAULT_SETTINGS.maxContentLength;
  }

  /**
   * Writes a debug message to the console when `settings.debugMode` is truthy.
   *
   * @param {string} message - Text appended to the "ContentExtractor: " prefix.
   * @param {*} [data=null] - Optional payload logged as a second argument.
   */
  log(message, data = null) {
    if (this.settings.debugMode) {
      // `??` keeps legitimate falsy payloads such as 0 or false visible.
      console.log(`ContentExtractor: ${message}`, data ?? "");
    }
  }

  /**
   * Builds a title string from several places in the document (document
   * title, Open Graph / Twitter meta tags, first <h1>, title-like classes).
   *
   * @returns {string} The distinct titles joined by spaces, capped at 500
   *   characters and trimmed.
   */
  extractTitle() {
    const titleSources = [
      () => document.title,
      () => document.querySelector('meta[property="og:title"]')?.content,
      () => document.querySelector('meta[name="twitter:title"]')?.content,
      () => document.querySelector("h1")?.textContent,
      () =>
        document.querySelector('.title, .headline, [class*="title"]')
          ?.textContent,
    ];

    const candidates = titleSources
      .map((readTitle) => readTitle())
      .filter((value) => typeof value === "string")
      .map((value) => value.trim())
      .filter(Boolean);

    // Several sources usually carry the very same title; keep each one once.
    return [...new Set(candidates)].join(" ").substring(0, 500).trim();
  }

  /**
   * Extracts the main textual content of the page in three stages: main
   * content containers first, then generic text elements if fewer than 300
   * characters were found, then the body's visible text if the result is still
   * below 200 characters.
   *
   * @returns {string} Cleaned text, at most `maxContentLength` characters.
   */
  extractContent() {
    this.log("Début extraction de contenu...");

    const maxLength = this.getMaxContentLength();
    const extractedTexts = new Set();
    let content = "";

    // Priority selectors for the main content.
    const contentSelectors = [
      "article",
      '[role="main"]',
      "main",
      ".article-content, .post-content, .entry-content",
      ".content, .story-body, .article-body",
      "#article-body, .post-body, .text-content",
    ];

    // Main content extraction.
    for (const selector of contentSelectors) {
      const elements = document.querySelectorAll(selector);
      if (elements.length > 0) {
        this.log(`Contenu trouvé avec: ${selector}`);
        content += this.extractTextFromElements(elements, extractedTexts);
        if (content.length > 1000) break;
      }
    }

    // Fallback when the main containers did not yield enough text.
    if (content.length < 300) {
      this.log("Contenu insuffisant, utilisation de fallbacks...");
      const fallbackSelectors = [
        "p, h1, h2, h3, h4, h5, h6",
        ".text, .description, .summary",
        '[class*="content"], [class*="text"]',
        "blockquote, figcaption",
      ];

      for (const selector of fallbackSelectors) {
        const elements = document.querySelectorAll(selector);
        content += this.extractTextFromElements(elements, extractedTexts, 30);
        if (content.length > 1500) break;
      }
    }

    // Last resort: the visible text of the whole body.
    if (content.length < 200) {
      this.log("Dernier recours - texte visible");
      // `document.body` is null until the <body> element has been parsed.
      const body = document.body;
      if (body) {
        content = this.cleanText(body.innerText).substring(0, maxLength);
      }
    }

    const finalContent = content.substring(0, maxLength).trim();
    this.log(`Extraction terminée: ${finalContent.length} caractères`);

    return finalContent;
  }

  /**
   * Concatenates the cleaned text of the given elements.
   *
   * Elements rejected by `shouldSkipElement`, texts of 15 characters or less
   * and texts already present in `extractedTexts` are ignored.
   *
   * @param {Iterable<Element>|ArrayLike<Element>} elements - Candidate elements.
   * @param {Set<string>} extractedTexts - Texts already collected; every newly
   *   accepted text is added to it.
   * @param {number} [maxElements=100] - Only the first `maxElements` elements
   *   are inspected.
   * @returns {string} Accepted texts, each followed by one space.
   */
  extractTextFromElements(elements, extractedTexts, maxElements = 100) {
    const maxLength = this.getMaxContentLength();
    const candidates = Array.from(elements ?? []).slice(0, maxElements);
    let text = "";

    for (const element of candidates) {
      if (this.shouldSkipElement(element)) continue;

      const elementText = this.cleanText(
        element.textContent || element.innerText
      );
      if (elementText.length > 15 && !extractedTexts.has(elementText)) {
        extractedTexts.add(elementText);
        text += `${elementText} `;

        // Stop once the cap is exceeded; the caller truncates the result.
        if (text.length > maxLength) break;
      }
    }

    return text;
  }

  /**
   * Tells whether an element looks like page chrome (navigation, ads, sharing
   * widgets, ...) or is hidden, and must therefore be left out.
   *
   * @param {Element} element - Element to inspect.
   * @returns {boolean} `true` when the element must be skipped.
   */
  shouldSkipElement(element) {
    const skipClasses = [
      "nav",
      "menu",
      "footer",
      "header",
      "sidebar",
      "advertisement",
      "social",
      "share",
    ];
    const skipIds = ["nav", "menu", "footer", "header", "sidebar", "comments"];

    // "ad"/"ads" only count as a standalone token ("ad", "top-ad", "ads_box"):
    // a plain substring test also hits "leading", "shadow", "headline",
    // "loaded", ... and would discard ordinary content elements.
    // Trade-off: fused names without a separator ("adunit", "adsbygoogle") are
    // not treated as ads.
    const adToken = /(?:^|[^a-z0-9])ads?(?:[^a-z0-9]|$)/;

    // On SVG elements `className` is an SVGAnimatedString, not a string, so
    // fall back to the raw attribute instead of calling string methods on it.
    const rawClassName = element.className;
    const className = (
      typeof rawClassName === "string"
        ? rawClassName
        : element.getAttribute("class") || ""
    ).toLowerCase();
    const id = typeof element.id === "string" ? element.id.toLowerCase() : "";

    return (
      skipClasses.some((skip) => className.includes(skip)) ||
      adToken.test(className) ||
      skipIds.some((skip) => id.includes(skip)) ||
      element.getAttribute("aria-hidden") === "true" ||
      getComputedStyle(element).display === "none"
    );
  }

  /**
   * Normalizes a piece of text: removes every character that is not a letter,
   * combining mark, digit, underscore, whitespace or basic punctuation, then
   * collapses whitespace runs into single spaces and trims the result.
   *
   * @param {string} text - Raw text.
   * @returns {string} Cleaned text, or "" when `text` is empty or not a string.
   */
  cleanText(text) {
    if (typeof text !== "string" || text === "") return "";

    return (
      text
        // Unicode-aware classes keep letters of every script (œ, ñ, ß, Cyrillic, ...)
        // instead of only ASCII plus a fixed list of French accents.
        .replace(/[^\p{L}\p{M}\p{N}_\s.,!?;:()\-'"%]/gu, "")
        // Collapse after stripping so removed symbols do not leave double spaces.
        .replace(/\s+/g, " ")
        .trim()
    );
  }

  /**
   * Guesses the kind of page from the current location.
   *
   * News, blog and commerce keywords are searched in the whole URL. Social
   * networks are matched against the host name only, so a tracking parameter
   * such as `?utm_source=facebook` does not turn an arbitrary page into a
   * social one.
   *
   * @returns {"news"|"blog"|"social"|"commerce"|"general"} Detected page type.
   */
  detectPageType() {
    const { href, hostname } = window.location;
    const url = String(href).toLowerCase();
    const host = String(hostname || "").toLowerCase();
    const containsAny = (haystack, needles) =>
      needles.some((needle) => haystack.includes(needle));

    if (containsAny(url, ["news", "article", "actualit"])) return "news";
    if (url.includes("blog")) return "blog";
    if (containsAny(host, ["facebook", "twitter", "instagram"])) return "social";
    if (containsAny(url, ["shop", "buy", "product", "commerce"])) {
      return "commerce";
    }
    return "general";
  }
}
|
|
|
|
// Make ContentExtractor available globally for Chrome extension
|
|
// Publish the class as a property of `window` so it can be reached as a global.
// NOTE(review): presumably other extension scripts running in the same context
// look it up there — confirm against the manifest's script list.
window.ContentExtractor = ContentExtractor;
|