Text 2 (#42)

* start with translate * translate and clean - notebook * spacy model in requirements * translate in module * clean in module * upload coverage only for ubuntu * update ubuntu version on runner * update dependencies * start tests for text * skip gcv test * fix age * more text tests * more text tests * add comment * test translation * fix numpy version; add reference data for trans * use utf-8 for windows
2025-10-29 13:06:04 +02:00 · 2022-12-19 15:03:05 +01:00 · 2022-12-19 15:03:05 +01:00 · 25ed5881a1
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -14,12 +14,11 @@ jobs:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
-        os: [ubuntu-20.04, windows-latest]
+        os: [ubuntu-22.04, windows-latest]
        python-version: [3.9]
    steps:
    - name: Checkout repository
      uses: actions/checkout@v2
-
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v2
      with:
@ -28,11 +27,13 @@ jobs:
      run: |
        python -m pip install --upgrade pip
        python -m pip install -e .
+        python -m spacy download en_core_web_md
    - name: Run pytest
      run: |
        cd misinformation
-        python -m pytest -s -m "not imageai" --cov=. --cov-report=xml
+        python -m pytest -s -m "not (imageai or gcv)" --cov=. --cov-report=xml
    - name: Upload coverage
+      if: matrix.os == 'ubuntu-22.04' && matrix.python-version == '3.9'
      uses: codecov/codecov-action@v3
      with:
        fail_ci_if_error: true
--- a/codecov.yml
+++ b/codecov.yml
@ -1,11 +0,0 @@
-codecov:
-  require_ci_to_pass: yes
-
-coverage:
-  status:
-    project:
-      default:
-        target: 0
-    patch:
-      default:
-        target: 70
--- a/misinformation/test/data/IMG_3755.jpg
+++ b/misinformation/test/data/IMG_3755.jpg
--- a/misinformation/test/data/IMG_3756.jpg
+++ b/misinformation/test/data/IMG_3756.jpg
--- a/misinformation/test/data/IMG_3757.jpg
+++ b/misinformation/test/data/IMG_3757.jpg
--- a/misinformation/test/data/text_IMG_3755.txt
+++ b/misinformation/test/data/text_IMG_3755.txt
@ -0,0 +1,5 @@
+Mathematische Formelsammlung
+für Ingenieure und Naturwissenschaftler
+Mit zahlreichen Abbildungen und Rechenbeispielen
+und einer ausführlichen Integraltafel
+3., verbesserte Auflage
--- a/misinformation/test/data/text_IMG_3756.txt
+++ b/misinformation/test/data/text_IMG_3756.txt
@ -0,0 +1,14 @@
+SCATTERING THEORY
+The Quantum Theory of
+Nonrelativistic Collisions
+JOHN R. TAYLOR
+University of Colorado
+ostaliga Lanbidean
+1 ilde
+ballenger stor goin
+gdĐOL, SIVI 23 TL 02
+de in obl
+och yd badalang
+a
+Ber
+ook Sy-RW enot go baldus
--- a/misinformation/test/data/text_IMG_3757.txt
+++ b/misinformation/test/data/text_IMG_3757.txt
@ -0,0 +1,10 @@
+THE
+ALGEBRAIC
+EIGENVALUE
+PROBLEM
+DOM
+NVS TIO
+MINA
+Monographs
+on Numerical Analysis
+J.. H. WILKINSON
--- a/misinformation/test/data/text_translated_IMG_3755.txt
+++ b/misinformation/test/data/text_translated_IMG_3755.txt
@ -0,0 +1,5 @@
+Mathematical Formula Collection
+for engineers and scientists
+With numerous illustrations and calculation examples
+and a detailed integral table
+3rd revised edition
--- a/misinformation/test/data/text_translated_IMG_3756.txt
+++ b/misinformation/test/data/text_translated_IMG_3756.txt
@ -0,0 +1,14 @@
+SCATTERING THEORY
+The Quantum Theory of
+Nonrelativistic Collisions
+JOHN R. TAYLOR
+University of Colorado
+ostaliga Lanbidean
+1 ilde
+balloons big goin
+gdĐOL, SIVI 23 TL
+there in obl
+och yd change
+a
+Ber
+ook Sy-RW isn't going anywhere
--- a/misinformation/test/data/text_translated_IMG_3757.txt
+++ b/misinformation/test/data/text_translated_IMG_3757.txt
@ -0,0 +1,10 @@
+THE
+ALGEBRAIC
+EIGENVALUE
+PROBLEM
+DOM
+NVS TIO
+MINA
+Monographs
+on Numerical Analysis
+J.. H. WILKINSON
--- a/misinformation/test/pytest.ini
+++ b/misinformation/test/pytest.ini
@ -1,3 +1,4 @@
 [pytest]
 markers =
-    imageai: mark a test related to imageai.
+    imageai: mark a test related to imageai.
+    gcv: mark google cloud vision tests - skip to save money.
--- a/misinformation/test/test_text.py
+++ b/misinformation/test/test_text.py
@ -0,0 +1,77 @@
+import os
+import pytest
+import spacy
+import misinformation.text as tt
+
+# Test images, keyed by image base name; each entry is the subdict handed to
+# TextDetector.
+TESTDICT = {
+    name: {"filename": "./test/data/" + name + ".jpg"}
+    for name in ("IMG_3755", "IMG_3756", "IMG_3757")
+}
+
+# Expected detected source language per image, in TESTDICT order.
+LANGUAGES = ["de", "om", "en"]
+
+# Credentials file for the Google client used by the gcv-marked test.
+os.environ[
+    "GOOGLE_APPLICATION_CREDENTIALS"
+] = "../data/seismic-bonfire-329406-412821a70264.json"
+
+
+def test_TextDetector():
+    """A freshly constructed TextDetector has every text key set to None."""
+    expected_keys = ("text", "text_language", "text_english", "text_cleaned")
+    for entry in TESTDICT.values():
+        detector = tt.TextDetector(entry)
+        for key in expected_keys:
+            assert detector.subdict[key] is None
+
+
+@pytest.mark.gcv
+def test_get_text_from_image():
+    """Detected text of each test image equals its stored reference text.
+
+    Marked ``gcv`` because get_text_from_image calls Google Cloud Vision.
+    """
+    for name, entry in TESTDICT.items():
+        detector = tt.TextDetector(entry)
+        detector.get_text_from_image()
+        reference_path = "./test/data/text_" + name + ".txt"
+        with open(reference_path, "r", encoding="utf8") as handle:
+            expected = handle.read()
+        assert detector.subdict["text"] == expected
+
+
+def test_translate_text():
+    """translate_text reports the expected language and stored translation."""
+    for (name, entry), expected_lang in zip(TESTDICT.items(), LANGUAGES):
+        detector = tt.TextDetector(entry)
+        source_path = "./test/data/text_" + name + ".txt"
+        english_path = "./test/data/text_translated_" + name + ".txt"
+        with open(source_path, "r", encoding="utf8") as handle:
+            source_text = handle.read()
+        with open(english_path, "r", encoding="utf8") as handle:
+            expected_english = handle.read()
+        # Feed the stored OCR text in directly so no image detection is needed.
+        detector.subdict["text"] = source_text
+        detector.translate_text()
+        assert detector.subdict["text_language"] == expected_lang
+        assert detector.subdict["text_english"] == expected_english
+
+
+def test_init_spacy():
+    """_init_spacy turns the text in ``text_english`` into a spaCy Doc."""
+    test_obj = tt.TextDetector(TESTDICT["IMG_3755"])
+    ref_file = "./test/data/text_IMG_3755.txt"
+    # The reference text contains umlauts; read it as UTF-8 explicitly (as the
+    # other tests do) so the result does not depend on the platform's default
+    # encoding.
+    with open(ref_file, "r", encoding="utf8") as file:
+        reference_text = file.read()
+    test_obj.subdict["text_english"] = reference_text
+    test_obj._init_spacy()
+    assert isinstance(test_obj.doc, spacy.tokens.doc.Doc)
+
+
+def test_clean_text():
+    """clean_text drops the token without a word vector ("fjejg")."""
+    pipeline = spacy.load("en_core_web_md")
+    detector = tt.TextDetector(TESTDICT["IMG_3755"])
+    # Inject a ready-made Doc so the test does not depend on translation.
+    detector.doc = pipeline("I like cats and fjejg")
+    detector.clean_text()
+    expected = "I like cats and"
+    assert detector.subdict["text_clean"] == expected
--- a/misinformation/text.py
+++ b/misinformation/text.py
@ -1,4 +1,6 @@
 from google.cloud import vision
+from googletrans import Translator
+import spacy
 import io
 from misinformation import utils

@ -7,29 +9,37 @@ class TextDetector(utils.AnalysisMethod):
    def __init__(self, subdict: dict) -> None:
        super().__init__(subdict)
        self.subdict.update(self.set_keys())
+        self.translator = Translator()
+        # spacy load should be a separate method, with error if model not found / dynamic download
+        self.nlp = spacy.load("en_core_web_md")

    def set_keys(self) -> dict:
-        params = {"text": None}
+        params = {
+            "text": None,
+            "text_language": None,
+            "text_english": None,
+            "text_cleaned": None,
+        }
        return params

    def analyse_image(self):
-        """Detects text on the image."""
+        self.get_text_from_image()
+        self.translate_text()
+        self._init_spacy()
+        self.clean_text()
+        return self.subdict

+    def get_text_from_image(self):
+        """Detects text on the image."""
        path = self.subdict["filename"]
        client = vision.ImageAnnotatorClient()
-
        with io.open(path, "rb") as image_file:
            content = image_file.read()
-
        image = vision.Image(content=content)
-
        response = client.text_detection(image=image)
-        texts = response.text_annotations
+        texts = response.text_annotations[0].description
        # here check if text was found
-        self.subdict = {"text": []}
-        for text in texts:
-            self.subdict["text"].append(text.description)
-
+        self.subdict = {"text": texts}
        if response.error.message:
            raise Exception(
                "{}\nFor more info on error messages, check: "
@ -37,4 +47,21 @@ class TextDetector(utils.AnalysisMethod):
                    response.error.message
                )
            )
-        return self.subdict
+
+    def translate_text(self):
+        """Translate the detected text and record its source language.
+
+        Reads ``subdict["text"]``, passes it to ``self.translator`` and stores
+        the reported source language in ``subdict["text_language"]`` and the
+        translated text in ``subdict["text_english"]``.
+        """
+        result = self.translator.translate(self.subdict["text"])
+        self.subdict.update(text_language=result.src, text_english=result.text)
+
+    def _init_spacy(self):
+        """Run ``self.nlp`` on the English text and keep the result as ``self.doc``."""
+        english_text = self.subdict["text_english"]
+        self.doc = self.nlp(english_text)
+
+    def clean_text(self):
+        """Remove numbers and unrecognized words from the spaCy doc's text.
+
+        Iterates over ``self.doc`` and keeps only tokens that have a word
+        vector and are not tagged ``NUM``. The kept tokens are joined with
+        single spaces and stripped of surrounding whitespace.
+
+        The result is stored under ``"text_cleaned"``, the key initialised by
+        ``set_keys``, and also under ``"text_clean"``, the key this method
+        originally wrote, so that existing readers of either key keep working.
+        """
+        kept = [
+            token.text
+            for token in self.doc
+            if token.pos_ != "NUM" and token.has_vector
+        ]
+        cleaned = " ".join(kept).strip()
+        self.subdict["text_clean"] = self.subdict["text_cleaned"] = cleaned
--- a/notebooks/facial_expressions.ipynb
+++ b/notebooks/facial_expressions.ipynb
@ -42,7 +42,7 @@
   "outputs": [],
   "source": [
    "images = misinformation.find_files(\n",
-    "    path=\"/home/inga/projects/misinformation-project/misinformation/data/test_no_text/\",\n",
+    "    path=\"../data/test_no_text/\",\n",
    "    limit=1000,\n",
    ")"
   ]
@ -117,9 +117,8 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "image_ids = [key for key in mydict.keys()]\n",
-    "for i in image_ids:\n",
-    "    mydict[i] = misinformation.faces.EmotionDetector(mydict[i]).analyse_image()"
+    "for key in mydict.keys():\n",
+    "    mydict[key] = misinformation.faces.EmotionDetector(mydict[key]).analyse_image()"
   ]
  },
  {
@ -202,7 +201,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.4"
+   "version": "3.10.6"
  }
 },
 "nbformat": 4,
--- a/notebooks/get-text-from-image.ipynb
+++ b/notebooks/get-text-from-image.ipynb
@ -28,7 +28,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "images = misinformation.find_files(limit=1000)"
+    "images = misinformation.find_files(path=\"../data/images-little-text/\", limit=1000)"
   ]
  },
  {
@ -38,7 +38,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "for i in images:\n",
+    "for i in images[0:10]:\n",
    "    display(Image(filename=i))"
   ]
  },
@ -60,7 +60,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "for i in mysubfiles:\n",
+    "for i in mysubfiles[0:10]:\n",
    "    display(Image(filename=i))"
   ]
  },
@ -71,287 +71,17 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "mydict = misinformation.utils.initialize_dict(mysubfiles)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "07b7a7a3",
-   "metadata": {},
-   "source": [
-    "# Pre-process the images: Convert to greyscale and increase contrast"
+    "mydict = misinformation.utils.initialize_dict(images[0:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "4cacfb0d",
+   "id": "3be954ef-d31f-4e4d-857c-c14d5fda91f1",
   "metadata": {},
   "outputs": [],
   "source": [
-    "import cv2\n",
-    "from matplotlib import pyplot as plt\n",
-    "import numpy as np"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c94b02cb-9e96-4812-8448-8bc731bfd8aa",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "! pip install matplotlib\n",
-    "! pip install numpy"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "8a2d3057",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def preprocess(filename):\n",
-    "    \"\"\"Preprocess the image to enhance features for extraction.\"\"\"\n",
-    "    image = cv2.imread(filename)\n",
-    "    # preserve the original image\n",
-    "    #     original = image.copy()\n",
-    "    # Grayscale, Gaussian blur, Otsu's threshold\n",
-    "    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)\n",
-    "    # sharpen contrast by first smoothing and then substracting the smoothed and thresholded version\n",
-    "    sharpened = unsharp_mask(gray, amount=1.1, threshold=0.1)\n",
-    "    inverted = invert_image(sharpened)\n",
-    "    return gray, sharpened, inverted\n",
-    "\n",
-    "\n",
-    "# use unsharp mask algorithm from opencv\n",
-    "# https://docs.opencv.org/4.x/d1/d10/classcv_1_1MatExpr.html#details\n",
-    "def unsharp_mask(image, kernel_size=(5, 5), sigma=1.0, amount=1.0, threshold=0):\n",
-    "    \"\"\"Return a sharpened version of the image, using an unsharp mask.\n",
-    "    Amount: 1 is neutral, higher values result in shaprer images. threshold is the value below which the difference between blurred and original image gets discarded.\"\"\"\n",
-    "    blurred = cv2.GaussianBlur(image, kernel_size, sigma)\n",
-    "    sharpened = float(amount + 1) * image - float(amount) * blurred\n",
-    "    sharpened = np.maximum(sharpened, np.zeros(sharpened.shape))\n",
-    "    sharpened = np.minimum(sharpened, 255 * np.ones(sharpened.shape))\n",
-    "    sharpened = sharpened.round().astype(np.uint8)\n",
-    "    if threshold > 0:\n",
-    "        low_contrast_mask = np.absolute(image - blurred) < threshold\n",
-    "        np.copyto(sharpened, image, where=low_contrast_mask)\n",
-    "    return sharpened\n",
-    "\n",
-    "\n",
-    "def invert_image(image):\n",
-    "    return cv2.bitwise_not(image)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a483868b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "grey_image = []\n",
-    "for i in mysubfiles:\n",
-    "    grey_image.append(preprocess(i))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "08ed750d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "for image in grey_image:\n",
-    "    # disable default colormap in imshow\n",
-    "    plt.imshow(image[0], cmap=\"gray\", vmin=0, vmax=255)\n",
-    "    plt.imshow(image[1], cmap=\"gray\", vmin=0, vmax=255)\n",
-    "    plt.show()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "71ac2229",
-   "metadata": {},
-   "source": [
-    "mabe further preprocess in cropping out text regions..?"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "7786d09c",
-   "metadata": {},
-   "source": [
-    "# Try out different libraries\n",
-    "## The standard go-to tool that is slightly complicated: pytesseract\n",
-    "Install tesseract and the language libraries:\n",
-    "```\n",
-    "sudo apt install tesseract-ocr  \n",
-    "sudo apt install tesseract-ocr-all  \n",
-    "sudo apt install imagemagick  \n",
-    "```  "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0d69504c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from pytesseract import pytesseract\n",
-    "\n",
-    "pytesseract.tesseract_cmd = r\"tesseract\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0b27c98c-b437-4c8b-8844-96d8718eea49",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "! pip install pytesseract"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "529de8d8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "myimage = grey_image[1]\n",
-    "plt.imshow(myimage[0], cmap=\"gray\", vmin=0, vmax=255)\n",
-    "plt.show()\n",
-    "\n",
-    "plt.imshow(myimage[1], cmap=\"gray\", vmin=0, vmax=255)\n",
-    "plt.show()\n",
-    "\n",
-    "plt.imshow(myimage[2], cmap=\"gray\", vmin=0, vmax=255)\n",
-    "plt.show()\n",
-    "\n",
-    "text = pytesseract.image_to_string(myimage[0])\n",
-    "print(text)\n",
-    "text = pytesseract.image_to_string(myimage[1])\n",
-    "print(text)\n",
-    "text = pytesseract.image_to_string(myimage[2])\n",
-    "print(text)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "e25dd39e",
-   "metadata": {},
-   "source": [
-    "Here we probably would need to compare extractopm from different preprocessed images and overlay in a final text."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "36495f3f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "for image in mysubfiles:\n",
-    "    # Loading image using OpenCV\n",
-    "    img = cv2.imread(image)\n",
-    "\n",
-    "    # Preprocessing image\n",
-    "    # Converting to grayscale\n",
-    "    gray_image = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n",
-    "\n",
-    "    # creating Binary image by selecting proper threshold\n",
-    "    binary_image = cv2.threshold(\n",
-    "        gray_image, 130, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU\n",
-    "    )[1]\n",
-    "\n",
-    "    # Inverting the image\n",
-    "    inverted_bin = cv2.bitwise_not(binary_image)\n",
-    "\n",
-    "    # Some noise reduction\n",
-    "    kernel = np.ones((2, 2), np.uint8)\n",
-    "    processed_img = cv2.erode(inverted_bin, kernel, iterations=1)\n",
-    "    processed_img = cv2.dilate(processed_img, kernel, iterations=1)\n",
-    "\n",
-    "    # Applying image_to_string method\n",
-    "    text = pytesseract.image_to_string(processed_img)\n",
-    "    plt.imshow(processed_img, cmap=\"gray\", vmin=0, vmax=255)\n",
-    "    plt.show()\n",
-    "    print(text)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "d6532019",
-   "metadata": {},
-   "source": [
-    "## keras-ocr\n",
-    "Not sure how to create an image object without a url.\n",
-    "https://keras-ocr.readthedocs.io/en/latest/examples/using_pretrained_models.html"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "44e38871",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import keras_ocr\n",
-    "\n",
-    "pipeline = keras_ocr.pipeline.Pipeline()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "2bb55068-ddd4-4b90-ae94-90181980d3c0",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "! pip install keras-ocr"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0002f2c4",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "images = [\n",
-    "    keras_ocr.tools.read(url)\n",
-    "    for url in [\n",
-    "        \"https://storage.googleapis.com/gcptutorials.com/examples/keras-ocr-img-1.jpg\",\n",
-    "        \"https://storage.googleapis.com/gcptutorials.com/examples/keras-ocr-img-2.png\",\n",
-    "    ]\n",
-    "]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1567dc85",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "prediction_groups = pipeline.recognize(images)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "fb1ca152",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "predicted_image_1 = prediction_groups[0]\n",
-    "for text, box in predicted_image_1:\n",
-    "    print(text)"
+    "mydict"
   ]
  },
  {
@ -359,10 +89,30 @@
   "id": "7b8b929f",
   "metadata": {},
   "source": [
-    "## google cloud vision API\n",
+    "# google cloud vision API\n",
    "First 1000 images per month are free."
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cbf74c0b-52fe-4fb8-b617-f18611e8f986",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "os.environ[\n",
+    "    \"GOOGLE_APPLICATION_CREDENTIALS\"\n",
+    "] = \"../data/seismic-bonfire-329406-412821a70264.json\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0891b795-c7fe-454c-a45d-45fadf788142",
+   "metadata": {},
+   "source": [
+    "## Inspect the elements per image"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
@ -370,19 +120,15 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "os.environ[\n",
-    "    \"GOOGLE_APPLICATION_CREDENTIALS\"\n",
-    "] = \"/home/inga/projects/misinformation-project/misinformation-notes/seismic-bonfire-329406-412821a70264.json\"\n",
    "misinformation.explore_analysis(mydict, identify=\"text-on-image\")"
   ]
  },
  {
   "cell_type": "markdown",
-   "id": "d54407ad",
+   "id": "9c3e72b5-0e57-4019-b45e-3e36a74e7f52",
   "metadata": {},
   "source": [
-    "## MS Azure\n",
-    "https://docs.microsoft.com/en-us/azure/search/cognitive-search-concept-image-scenarios"
+    "## Or directly analyze for further processing"
   ]
  },
  {
@ -391,6 +137,143 @@
   "id": "365c78b1-7ff4-4213-86fa-6a0a2d05198f",
   "metadata": {},
   "outputs": [],
+   "source": [
+    "for key in mydict:\n",
+    "    print(key)\n",
+    "    mydict[key] = misinformation.text.TextDetector(mydict[key]).analyse_image()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c75a2fff-9d59-4634-8d28-e90a909caa23",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c978fdb4-1f3a-4b78-b6ff-79c6e8a6fe82",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(mydict[\"104157S_eng\"][\"text\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3724f8d8-be0f-44eb-8ff9-b425eee94efc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test = mydict[\"104157S_eng\"][\"text\"][0]\n",
+    "print(test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2fb865ab-6d16-4a4c-b004-9d39fcba6812",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# flake8-noqa-cell\n",
+    "from googletrans import Translator"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "051ad473-8f75-40fc-ae90-7d8176cf816f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "translator = Translator()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b571d900-8829-4095-904f-dfee3ce46041",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "result = translator.translate(test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e1ff0b21-d2cc-4a50-8e86-a45362f1a0a1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(result.text)\n",
+    "print(result.src)\n",
+    "print(result.origin)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e135f00e-cdd5-4931-8649-ba0b293e8bdd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# now clean the gibberish\n",
+    "# do spell check\n",
+    "# flake8-noqa-cell\n",
+    "import contextualSpellCheck\n",
+    "import spacy\n",
+    "\n",
+    "nlp = spacy.load(\"en_core_web_md\")\n",
+    "contextualSpellCheck.add_to_pipe(nlp)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "01d17f6e-30ae-4cc1-ad03-11a3be6847c2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "doc = nlp(result.text)\n",
+    "print(doc._.outcome_spellCheck)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0ef91dd2-70a8-4a04-b50f-5efba076bbb0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "final_text = []\n",
+    "for token in doc:\n",
+    "    if token.pos_ != \"SPACE\":\n",
+    "        if token.pos_ != \"NUM\":\n",
+    "            if token.has_vector:\n",
+    "                final_text.append(token.text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4d0a3360-3fd9-4ab3-b4dc-122e90c16c7c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\" \".join(final_text))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b1abe05e-0c2b-4769-8fa5-fc67297288ea",
+   "metadata": {},
+   "outputs": [],
   "source": []
  }
 ],
@ -410,7 +293,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.4"
+   "version": "3.10.6"
  },
  "vscode": {
   "interpreter": {
--- a/notebooks/objects_expression.ipynb
+++ b/notebooks/objects_expression.ipynb
@ -165,7 +165,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.4"
+   "version": "3.10.6"
  },
  "vscode": {
   "interpreter": {
@ -174,5 +174,5 @@
  }
 },
 "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }
--- a/pyproject.toml
+++ b/pyproject.toml
@ -25,7 +25,7 @@ dependencies = [
    "cvlib",
    "deepface",
    "ipywidgets",
-    "numpy",
+    "numpy<=1.23.4",
    "opencv_python",
    "pandas",
    "pooch",
@ -40,6 +40,8 @@ dependencies = [
    "matplotlib",
    "pytest",
    "opencv-contrib-python",
+    "googletrans==3.1.0a0",
+    "spacy",
 ]

 [project.scripts]
--- a/requirements.txt
+++ b/requirements.txt
@ -2,7 +2,7 @@ google-cloud-vision
 cvlib
 deepface
 ipywidgets
-numpy
+numpy<=1.23.4
 opencv_python
 pandas
 pooch
@ -16,3 +16,6 @@ pytest
 pytest-cov
 matplotlib
 opencv-contrib-python
+googletrans==3.1.0a0
+spacy
+https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.1/en_core_web_md-3.4.1.tar.gz