From cf1e1b83d771e35ce4faa86def654c5fc8e7c271 Mon Sep 17 00:00:00 2001
From: Inga Ulusoy <inga.ulusoy@uni-heidelberg.de>
Date: Thu, 30 Mar 2023 11:23:01 +0200
Subject: [PATCH] add text summary

---
 misinformation/test/test_text.py | 12 ++++++++++++
 misinformation/text.py           |  9 +++++++++
 pyproject.toml                   |  1 +
 3 files changed, 22 insertions(+)

diff --git a/misinformation/test/test_text.py b/misinformation/test/test_text.py
index 8543504..a102ed5 100644
--- a/misinformation/test/test_text.py
+++ b/misinformation/test/test_text.py
@@ -116,6 +116,18 @@ def test_sentiment_analysis():
     assert test_obj.subdict["subjectivity"] == 0.6
 
 
+def test_text_summary(get_path):
+    """Check that text_summary() reproduces the stored reference summary."""
+    result_dict = {}
+    detector = tt.TextDetector(result_dict, analyse_text=True)
+    with open(get_path + "example_summary.txt", "r", encoding="utf8") as handle:
+        detector.subdict["text_english"] = handle.read()
+    detector.text_summary()
+    # the summary is looked up in the dict that was handed to TextDetector
+    expected = " I’m sorry, but I don’t want to be an emperor. That’s not my business. I should like to help everyone - if possible - Jew, Gentile - black man - white . We all want to help one another. In this world there is room for everyone. The way of life can be free and beautiful, but we have lost the way ."
+    assert result_dict["summary_text"] == expected
+
+
 def test_PostprocessText(set_testdict, get_path):
     reference_dict = "THE\nALGEBRAIC\nEIGENVALUE\nPROBLEM\nDOM\nNVS TIO\nMINA\nMonographs\non Numerical Analysis\nJ.. H. WILKINSON"
     reference_df = "Mathematische Formelsammlung\nfür Ingenieure und Naturwissenschaftler\nMit zahlreichen Abbildungen und Rechenbeispielen\nund einer ausführlichen Integraltafel\n3., verbesserte Auflage"
diff --git a/misinformation/text.py b/misinformation/text.py
index 9466f30..9f3c776 100644
--- a/misinformation/text.py
+++ b/misinformation/text.py
@@ -9,6 +9,7 @@ from misinformation import utils
 import grpc
 import pandas as pd
 from bertopic import BERTopic
+from transformers import pipeline
 
 # make widgets work again
 # clean text has weird spaces and separation of "do n't"
@@ -119,6 +120,14 @@ class TextDetector(utils.AnalysisMethod):
         # where 0.0 is very objective and 1.0 is very subjective
         self.subdict["subjectivity"] = self.doc._.blob.subjectivity
 
+    def text_summary(self):
+        """Summarize subdict["text_english"] and merge the result into subdict."""
+        # pin model and revision so results do not drift with library defaults
+        model, revision = "sshleifer/distilbart-cnn-12-6", "a4f8f3e"
+        pipe = pipeline("summarization", model=model, revision=revision)
+        self.subdict.update(pipe(self.subdict["text_english"])[0])
+    # TODO: text_sentiment_transformers() using pipeline("text-classification")
+
 
 class PostprocessText:
     def __init__(
diff --git a/pyproject.toml b/pyproject.toml
index fb15c17..87246a8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -48,6 +48,7 @@ dependencies = [
     "tensorflow",
     "textblob",
     "torch",
+    "transformers",
     "google-cloud-vision",
     "setuptools",
     "opencv-contrib-python",