From cf1e1b83d771e35ce4faa86def654c5fc8e7c271 Mon Sep 17 00:00:00 2001
From: Inga Ulusoy
Date: Thu, 30 Mar 2023 11:23:01 +0200
Subject: [PATCH] add text summary

---
 misinformation/test/test_text.py | 12 ++++++++++++
 misinformation/text.py           |  9 +++++++++
 pyproject.toml                   |  1 +
 3 files changed, 22 insertions(+)

diff --git a/misinformation/test/test_text.py b/misinformation/test/test_text.py
index 8543504..a102ed5 100644
--- a/misinformation/test/test_text.py
+++ b/misinformation/test/test_text.py
@@ -116,6 +116,18 @@ def test_sentiment_analysis():
     assert test_obj.subdict["subjectivity"] == 0.6
 
 
+def test_text_summary(get_path):
+    """Summarize the example text and compare with the stored reference."""
+    result_dict = {}
+    detector = tt.TextDetector(result_dict, analyse_text=True)
+    with open(get_path + "example_summary.txt", "r", encoding="utf8") as infile:
+        detector.subdict["text_english"] = infile.read()
+    detector.text_summary()
+    expected_summary = " I’m sorry, but I don’t want to be an emperor. That’s not my business. I should like to help everyone - if possible - Jew, Gentile - black man - white . We all want to help one another. In this world there is room for everyone. The way of life can be free and beautiful, but we have lost the way ."
+    # presumably subdict aliases the dict given to the constructor - confirm
+    assert result_dict["summary_text"] == expected_summary
+
+
 def test_PostprocessText(set_testdict, get_path):
     reference_dict = "THE\nALGEBRAIC\nEIGENVALUE\nPROBLEM\nDOM\nNVS TIO\nMINA\nMonographs\non Numerical Analysis\nJ.. H. 
WILKINSON"
     reference_df = "Mathematische Formelsammlung\nfür Ingenieure und Naturwissenschaftler\nMit zahlreichen Abbildungen und Rechenbeispielen\nund einer ausführlichen Integraltafel\n3., verbesserte Auflage"
diff --git a/misinformation/text.py b/misinformation/text.py
index 9466f30..9f3c776 100644
--- a/misinformation/text.py
+++ b/misinformation/text.py
@@ -9,6 +9,7 @@ from misinformation import utils
 import grpc
 import pandas as pd
 from bertopic import BERTopic
+from transformers import pipeline
 
 # make widgets work again
 # clean text has weird spaces and separation of "do n't"
@@ -119,6 +120,14 @@ class TextDetector(utils.AnalysisMethod):
         # where 0.0 is very objective and 1.0 is very subjective
         self.subdict["subjectivity"] = self.doc._.blob.subjectivity
 
+    def text_summary(self):
+        """Summarize subdict["text_english"] and merge the result into subdict."""
+        # pin model and revision: the library default may change between releases
+        pipe = pipeline(
+            "summarization", model="sshleifer/distilbart-cnn-12-6", revision="a4f8f3e"
+        )
+        self.subdict.update(pipe(self.subdict["text_english"])[0])
+
 
 class PostprocessText:
     def __init__(
diff --git a/pyproject.toml b/pyproject.toml
index fb15c17..87246a8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -48,6 +48,7 @@ dependencies = [
     "tensorflow",
     "textblob",
     "torch",
+    "transformers",
     "google-cloud-vision",
     "setuptools",
     "opencv-contrib-python",