diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 239de40..78fa7a2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,12 +14,11 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-20.04, windows-latest] + os: [ubuntu-22.04, windows-latest] python-version: [3.9] steps: - name: Checkout repository uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: @@ -28,11 +27,13 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install -e . + python -m spacy download en_core_web_md - name: Run pytest run: | cd misinformation - python -m pytest -s -m "not imageai" --cov=. --cov-report=xml + python -m pytest -s -m "not (imageai or gcv)" --cov=. --cov-report=xml - name: Upload coverage + if: matrix.os == 'ubuntu-22.04' && matrix.python-version == '3.9' uses: codecov/codecov-action@v3 with: fail_ci_if_error: true diff --git a/codecov.yml b/codecov.yml deleted file mode 100644 index 78b6eda..0000000 --- a/codecov.yml +++ /dev/null @@ -1,11 +0,0 @@ -codecov: - require_ci_to_pass: yes - -coverage: - status: - project: - default: - target: 0 - patch: - default: - target: 70 diff --git a/misinformation/test/data/IMG_3755.jpg b/misinformation/test/data/IMG_3755.jpg new file mode 100644 index 0000000..b2b3af2 Binary files /dev/null and b/misinformation/test/data/IMG_3755.jpg differ diff --git a/misinformation/test/data/IMG_3756.jpg b/misinformation/test/data/IMG_3756.jpg new file mode 100644 index 0000000..0552142 Binary files /dev/null and b/misinformation/test/data/IMG_3756.jpg differ diff --git a/misinformation/test/data/IMG_3757.jpg b/misinformation/test/data/IMG_3757.jpg new file mode 100644 index 0000000..f7a50c6 Binary files /dev/null and b/misinformation/test/data/IMG_3757.jpg differ diff --git a/misinformation/test/data/text_IMG_3755.txt b/misinformation/test/data/text_IMG_3755.txt new file mode 100644 index 0000000..f42ffd7 --- /dev/null +++ 
b/misinformation/test/data/text_IMG_3755.txt @@ -0,0 +1,5 @@ +Mathematische Formelsammlung +für Ingenieure und Naturwissenschaftler +Mit zahlreichen Abbildungen und Rechenbeispielen +und einer ausführlichen Integraltafel +3., verbesserte Auflage \ No newline at end of file diff --git a/misinformation/test/data/text_IMG_3756.txt b/misinformation/test/data/text_IMG_3756.txt new file mode 100644 index 0000000..49690b1 --- /dev/null +++ b/misinformation/test/data/text_IMG_3756.txt @@ -0,0 +1,14 @@ +SCATTERING THEORY +The Quantum Theory of +Nonrelativistic Collisions +JOHN R. TAYLOR +University of Colorado +ostaliga Lanbidean +1 ilde +ballenger stor goin +gdĐOL, SIVI 23 TL 02 +de in obl +och yd badalang +a +Ber +ook Sy-RW enot go baldus \ No newline at end of file diff --git a/misinformation/test/data/text_IMG_3757.txt b/misinformation/test/data/text_IMG_3757.txt new file mode 100644 index 0000000..6fee303 --- /dev/null +++ b/misinformation/test/data/text_IMG_3757.txt @@ -0,0 +1,10 @@ +THE +ALGEBRAIC +EIGENVALUE +PROBLEM +DOM +NVS TIO +MINA +Monographs +on Numerical Analysis +J.. H. WILKINSON \ No newline at end of file diff --git a/misinformation/test/data/text_translated_IMG_3755.txt b/misinformation/test/data/text_translated_IMG_3755.txt new file mode 100644 index 0000000..2d545ae --- /dev/null +++ b/misinformation/test/data/text_translated_IMG_3755.txt @@ -0,0 +1,5 @@ +Mathematical Formula Collection +for engineers and scientists +With numerous illustrations and calculation examples +and a detailed integral table +3rd revised edition \ No newline at end of file diff --git a/misinformation/test/data/text_translated_IMG_3756.txt b/misinformation/test/data/text_translated_IMG_3756.txt new file mode 100644 index 0000000..04479ee --- /dev/null +++ b/misinformation/test/data/text_translated_IMG_3756.txt @@ -0,0 +1,14 @@ +SCATTERING THEORY +The Quantum Theory of +Nonrelativistic Collisions +JOHN R. 
TAYLOR +University of Colorado +ostaliga Lanbidean +1 ilde +balloons big goin +gdĐOL, SIVI 23 TL +there in obl +och yd change +a +Ber +ook Sy-RW isn't going anywhere \ No newline at end of file diff --git a/misinformation/test/data/text_translated_IMG_3757.txt b/misinformation/test/data/text_translated_IMG_3757.txt new file mode 100644 index 0000000..6fee303 --- /dev/null +++ b/misinformation/test/data/text_translated_IMG_3757.txt @@ -0,0 +1,10 @@ +THE +ALGEBRAIC +EIGENVALUE +PROBLEM +DOM +NVS TIO +MINA +Monographs +on Numerical Analysis +J.. H. WILKINSON \ No newline at end of file diff --git a/misinformation/test/pytest.ini b/misinformation/test/pytest.ini index 6a6a1f5..c1d744a 100644 --- a/misinformation/test/pytest.ini +++ b/misinformation/test/pytest.ini @@ -1,3 +1,4 @@ [pytest] markers = - imageai: mark a test related to imageai. \ No newline at end of file + imageai: mark a test related to imageai. + gcv: mark google cloud vision tests - skip to save money. \ No newline at end of file diff --git a/misinformation/test/test_text.py b/misinformation/test/test_text.py new file mode 100644 index 0000000..73211a6 --- /dev/null +++ b/misinformation/test/test_text.py @@ -0,0 +1,77 @@ +import os +import pytest +import spacy +import misinformation.text as tt + +TESTDICT = { + "IMG_3755": { + "filename": "./test/data/IMG_3755.jpg", + }, + "IMG_3756": { + "filename": "./test/data/IMG_3756.jpg", + }, + "IMG_3757": { + "filename": "./test/data/IMG_3757.jpg", + }, +} + +LANGUAGES = ["de", "om", "en"] + +os.environ[ + "GOOGLE_APPLICATION_CREDENTIALS" +] = "../data/seismic-bonfire-329406-412821a70264.json" + + +def test_TextDetector(): + for item in TESTDICT: + test_obj = tt.TextDetector(TESTDICT[item]) + assert test_obj.subdict["text"] is None + assert test_obj.subdict["text_language"] is None + assert test_obj.subdict["text_english"] is None + assert test_obj.subdict["text_clean"] is None + + +@pytest.mark.gcv +def test_get_text_from_image(): + for item in TESTDICT: + 
test_obj = tt.TextDetector(TESTDICT[item]) + test_obj.get_text_from_image() + ref_file = "./test/data/text_" + item + ".txt" + with open(ref_file, "r", encoding="utf8") as file: + reference_text = file.read() + assert test_obj.subdict["text"] == reference_text + + +def test_translate_text(): + for item, lang in zip(TESTDICT, LANGUAGES): + test_obj = tt.TextDetector(TESTDICT[item]) + ref_file = "./test/data/text_" + item + ".txt" + trans_file = "./test/data/text_translated_" + item + ".txt" + with open(ref_file, "r", encoding="utf8") as file: + reference_text = file.read() + with open(trans_file, "r", encoding="utf8") as file: + translated_text = file.read() + test_obj.subdict["text"] = reference_text + test_obj.translate_text() + assert test_obj.subdict["text_language"] == lang + assert test_obj.subdict["text_english"] == translated_text + + +def test_init_spacy(): + test_obj = tt.TextDetector(TESTDICT["IMG_3755"]) + ref_file = "./test/data/text_IMG_3755.txt" + with open(ref_file, "r", encoding="utf8") as file: + reference_text = file.read() + test_obj.subdict["text_english"] = reference_text + test_obj._init_spacy() + assert isinstance(test_obj.doc, spacy.tokens.doc.Doc) + + +def test_clean_text(): + nlp = spacy.load("en_core_web_md") + doc = nlp("I like cats and fjejg") + test_obj = tt.TextDetector(TESTDICT["IMG_3755"]) + test_obj.doc = doc + test_obj.clean_text() + result = "I like cats and" + assert test_obj.subdict["text_clean"] == result diff --git a/misinformation/text.py b/misinformation/text.py index dc34728..3c5ab71 100644 --- a/misinformation/text.py +++ b/misinformation/text.py @@ -1,4 +1,6 @@ from google.cloud import vision +from googletrans import Translator +import spacy import io from misinformation import utils @@ -7,29 +9,41 @@ class TextDetector(utils.AnalysisMethod): def __init__(self, subdict: dict) -> None: super().__init__(subdict) self.subdict.update(self.set_keys()) + self.translator = Translator() + # spacy load should be separate method with error if 
model not found / dynamic download + self.nlp = spacy.load("en_core_web_md") def set_keys(self) -> dict: - params = {"text": None} + params = { + "text": None, + "text_language": None, + "text_english": None, + "text_clean": None, + } return params def analyse_image(self): - """Detects text on the image.""" + """Run text detection, translation and cleaning; return the subdict.""" + self.get_text_from_image() + # skip translation and cleaning when no text was detected + if self.subdict["text"]: + self.translate_text() + self._init_spacy() + self.clean_text() + return self.subdict + def get_text_from_image(self): + """Detects text on the image.""" path = self.subdict["filename"] client = vision.ImageAnnotatorClient() - with io.open(path, "rb") as image_file: content = image_file.read() - image = vision.Image(content=content) - response = client.text_detection(image=image) texts = response.text_annotations # here check if text was found - self.subdict = {"text": []} - for text in texts: - self.subdict["text"].append(text.description) - + # update in place so "filename" and the other keys are kept + self.subdict["text"] = texts[0].description if texts else None if response.error.message: raise Exception( "{}\nFor more info on error messages, check: " @@ -37,4 +51,22 @@ class TextDetector(utils.AnalysisMethod): response.error.message ) ) - return self.subdict + + def translate_text(self): + """Store the detected source language and the translated text.""" + translated = self.translator.translate(self.subdict["text"]) + self.subdict["text_language"] = translated.src + self.subdict["text_english"] = translated.text + + def _init_spacy(self): + """Generate spacy doc object.""" + self.doc = self.nlp(self.subdict["text_english"]) + + def clean_text(self): + """Clean the text from unrecognized words and any numbers.""" + templist = [ + token.text + for token in self.doc + if token.pos_ != "NUM" and token.has_vector + ] + self.subdict["text_clean"] = " ".join(templist).strip() diff 
--git a/notebooks/facial_expressions.ipynb b/notebooks/facial_expressions.ipynb index 3117ee7..a0d93a5 100644 --- a/notebooks/facial_expressions.ipynb +++ b/notebooks/facial_expressions.ipynb @@ -42,7 +42,7 @@ "outputs": 
[], "source": [ "images = misinformation.find_files(\n", - " path=\"/home/inga/projects/misinformation-project/misinformation/data/test_no_text/\",\n", + " path=\"../data/test_no_text/\",\n", " limit=1000,\n", ")" ] @@ -117,9 +117,8 @@ "metadata": {}, "outputs": [], "source": [ - "image_ids = [key for key in mydict.keys()]\n", - "for i in image_ids:\n", - " mydict[i] = misinformation.faces.EmotionDetector(mydict[i]).analyse_image()" + "for key in mydict.keys():\n", + " mydict[key] = misinformation.faces.EmotionDetector(mydict[key]).analyse_image()" ] }, { @@ -202,7 +201,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.4" + "version": "3.10.6" } }, "nbformat": 4, diff --git a/notebooks/get-text-from-image.ipynb b/notebooks/get-text-from-image.ipynb index 1e8d485..7276073 100644 --- a/notebooks/get-text-from-image.ipynb +++ b/notebooks/get-text-from-image.ipynb @@ -28,7 +28,7 @@ "metadata": {}, "outputs": [], "source": [ - "images = misinformation.find_files(limit=1000)" + "images = misinformation.find_files(path=\"../data/images-little-text/\", limit=1000)" ] }, { @@ -38,7 +38,7 @@ "metadata": {}, "outputs": [], "source": [ - "for i in images:\n", + "for i in images[0:10]:\n", " display(Image(filename=i))" ] }, @@ -60,7 +60,7 @@ "metadata": {}, "outputs": [], "source": [ - "for i in mysubfiles:\n", + "for i in mysubfiles[0:10]:\n", " display(Image(filename=i))" ] }, @@ -71,287 +71,17 @@ "metadata": {}, "outputs": [], "source": [ - "mydict = misinformation.utils.initialize_dict(mysubfiles)" - ] - }, - { - "cell_type": "markdown", - "id": "07b7a7a3", - "metadata": {}, - "source": [ - "# Pre-process the images: Convert to greyscale and increase contrast" + "mydict = misinformation.utils.initialize_dict(images[0:10])" ] }, { "cell_type": "code", "execution_count": null, - "id": "4cacfb0d", + "id": "3be954ef-d31f-4e4d-857c-c14d5fda91f1", "metadata": {}, "outputs": [], "source": [ - "import cv2\n", - "from matplotlib 
import pyplot as plt\n", - "import numpy as np" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c94b02cb-9e96-4812-8448-8bc731bfd8aa", - "metadata": {}, - "outputs": [], - "source": [ - "! pip install matplotlib\n", - "! pip install numpy" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8a2d3057", - "metadata": {}, - "outputs": [], - "source": [ - "def preprocess(filename):\n", - " \"\"\"Preprocess the image to enhance features for extraction.\"\"\"\n", - " image = cv2.imread(filename)\n", - " # preserve the original image\n", - " # original = image.copy()\n", - " # Grayscale, Gaussian blur, Otsu's threshold\n", - " gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)\n", - " # sharpen contrast by first smoothing and then substracting the smoothed and thresholded version\n", - " sharpened = unsharp_mask(gray, amount=1.1, threshold=0.1)\n", - " inverted = invert_image(sharpened)\n", - " return gray, sharpened, inverted\n", - "\n", - "\n", - "# use unsharp mask algorithm from opencv\n", - "# https://docs.opencv.org/4.x/d1/d10/classcv_1_1MatExpr.html#details\n", - "def unsharp_mask(image, kernel_size=(5, 5), sigma=1.0, amount=1.0, threshold=0):\n", - " \"\"\"Return a sharpened version of the image, using an unsharp mask.\n", - " Amount: 1 is neutral, higher values result in shaprer images. 
threshold is the value below which the difference between blurred and original image gets discarded.\"\"\"\n", - " blurred = cv2.GaussianBlur(image, kernel_size, sigma)\n", - " sharpened = float(amount + 1) * image - float(amount) * blurred\n", - " sharpened = np.maximum(sharpened, np.zeros(sharpened.shape))\n", - " sharpened = np.minimum(sharpened, 255 * np.ones(sharpened.shape))\n", - " sharpened = sharpened.round().astype(np.uint8)\n", - " if threshold > 0:\n", - " low_contrast_mask = np.absolute(image - blurred) < threshold\n", - " np.copyto(sharpened, image, where=low_contrast_mask)\n", - " return sharpened\n", - "\n", - "\n", - "def invert_image(image):\n", - " return cv2.bitwise_not(image)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a483868b", - "metadata": {}, - "outputs": [], - "source": [ - "grey_image = []\n", - "for i in mysubfiles:\n", - " grey_image.append(preprocess(i))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "08ed750d", - "metadata": {}, - "outputs": [], - "source": [ - "for image in grey_image:\n", - " # disable default colormap in imshow\n", - " plt.imshow(image[0], cmap=\"gray\", vmin=0, vmax=255)\n", - " plt.imshow(image[1], cmap=\"gray\", vmin=0, vmax=255)\n", - " plt.show()" - ] - }, - { - "cell_type": "markdown", - "id": "71ac2229", - "metadata": {}, - "source": [ - "mabe further preprocess in cropping out text regions..?" 
- ] - }, - { - "cell_type": "markdown", - "id": "7786d09c", - "metadata": {}, - "source": [ - "# Try out different libraries\n", - "## The standard go-to tool that is slightly complicated: pytesseract\n", - "Install tesseract and the language libraries:\n", - "```\n", - "sudo apt install tesseract-ocr \n", - "sudo apt install tesseract-ocr-all \n", - "sudo apt install imagemagick \n", - "``` " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0d69504c", - "metadata": {}, - "outputs": [], - "source": [ - "from pytesseract import pytesseract\n", - "\n", - "pytesseract.tesseract_cmd = r\"tesseract\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0b27c98c-b437-4c8b-8844-96d8718eea49", - "metadata": {}, - "outputs": [], - "source": [ - "! pip install pytesseract" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "529de8d8", - "metadata": {}, - "outputs": [], - "source": [ - "myimage = grey_image[1]\n", - "plt.imshow(myimage[0], cmap=\"gray\", vmin=0, vmax=255)\n", - "plt.show()\n", - "\n", - "plt.imshow(myimage[1], cmap=\"gray\", vmin=0, vmax=255)\n", - "plt.show()\n", - "\n", - "plt.imshow(myimage[2], cmap=\"gray\", vmin=0, vmax=255)\n", - "plt.show()\n", - "\n", - "text = pytesseract.image_to_string(myimage[0])\n", - "print(text)\n", - "text = pytesseract.image_to_string(myimage[1])\n", - "print(text)\n", - "text = pytesseract.image_to_string(myimage[2])\n", - "print(text)" - ] - }, - { - "cell_type": "markdown", - "id": "e25dd39e", - "metadata": {}, - "source": [ - "Here we probably would need to compare extractopm from different preprocessed images and overlay in a final text." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "36495f3f", - "metadata": {}, - "outputs": [], - "source": [ - "for image in mysubfiles:\n", - " # Loading image using OpenCV\n", - " img = cv2.imread(image)\n", - "\n", - " # Preprocessing image\n", - " # Converting to grayscale\n", - " gray_image = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n", - "\n", - " # creating Binary image by selecting proper threshold\n", - " binary_image = cv2.threshold(\n", - " gray_image, 130, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU\n", - " )[1]\n", - "\n", - " # Inverting the image\n", - " inverted_bin = cv2.bitwise_not(binary_image)\n", - "\n", - " # Some noise reduction\n", - " kernel = np.ones((2, 2), np.uint8)\n", - " processed_img = cv2.erode(inverted_bin, kernel, iterations=1)\n", - " processed_img = cv2.dilate(processed_img, kernel, iterations=1)\n", - "\n", - " # Applying image_to_string method\n", - " text = pytesseract.image_to_string(processed_img)\n", - " plt.imshow(processed_img, cmap=\"gray\", vmin=0, vmax=255)\n", - " plt.show()\n", - " print(text)" - ] - }, - { - "cell_type": "markdown", - "id": "d6532019", - "metadata": {}, - "source": [ - "## keras-ocr\n", - "Not sure how to create an image object without a url.\n", - "https://keras-ocr.readthedocs.io/en/latest/examples/using_pretrained_models.html" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "44e38871", - "metadata": {}, - "outputs": [], - "source": [ - "import keras_ocr\n", - "\n", - "pipeline = keras_ocr.pipeline.Pipeline()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2bb55068-ddd4-4b90-ae94-90181980d3c0", - "metadata": {}, - "outputs": [], - "source": [ - "! 
pip install keras-ocr" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0002f2c4", - "metadata": {}, - "outputs": [], - "source": [ - "images = [\n", - " keras_ocr.tools.read(url)\n", - " for url in [\n", - " \"https://storage.googleapis.com/gcptutorials.com/examples/keras-ocr-img-1.jpg\",\n", - " \"https://storage.googleapis.com/gcptutorials.com/examples/keras-ocr-img-2.png\",\n", - " ]\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1567dc85", - "metadata": {}, - "outputs": [], - "source": [ - "prediction_groups = pipeline.recognize(images)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fb1ca152", - "metadata": {}, - "outputs": [], - "source": [ - "predicted_image_1 = prediction_groups[0]\n", - "for text, box in predicted_image_1:\n", - " print(text)" + "mydict" ] }, { @@ -359,10 +89,30 @@ "id": "7b8b929f", "metadata": {}, "source": [ - "## google cloud vision API\n", + "# google cloud vision API\n", "First 1000 images per month are free." 
] }, + { + "cell_type": "code", + "execution_count": null, + "id": "cbf74c0b-52fe-4fb8-b617-f18611e8f986", + "metadata": {}, + "outputs": [], + "source": [ + "os.environ[\n", + " \"GOOGLE_APPLICATION_CREDENTIALS\"\n", + "] = \"../data/seismic-bonfire-329406-412821a70264.json\"" + ] + }, + { + "cell_type": "markdown", + "id": "0891b795-c7fe-454c-a45d-45fadf788142", + "metadata": {}, + "source": [ + "## Inspect the elements per image" + ] + }, { "cell_type": "code", "execution_count": null, @@ -370,19 +120,15 @@ "metadata": {}, "outputs": [], "source": [ - "os.environ[\n", - " \"GOOGLE_APPLICATION_CREDENTIALS\"\n", - "] = \"/home/inga/projects/misinformation-project/misinformation-notes/seismic-bonfire-329406-412821a70264.json\"\n", "misinformation.explore_analysis(mydict, identify=\"text-on-image\")" ] }, { "cell_type": "markdown", - "id": "d54407ad", + "id": "9c3e72b5-0e57-4019-b45e-3e36a74e7f52", "metadata": {}, "source": [ - "## MS Azure\n", - "https://docs.microsoft.com/en-us/azure/search/cognitive-search-concept-image-scenarios" + "## Or directly analyze for further processing" ] }, { @@ -391,6 +137,143 @@ "id": "365c78b1-7ff4-4213-86fa-6a0a2d05198f", "metadata": {}, "outputs": [], + "source": [ + "for key in mydict:\n", + " print(key)\n", + " mydict[key] = misinformation.text.TextDetector(mydict[key]).analyse_image()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c75a2fff-9d59-4634-8d28-e90a909caa23", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c978fdb4-1f3a-4b78-b6ff-79c6e8a6fe82", + "metadata": {}, + "outputs": [], + "source": [ + "print(mydict[\"104157S_eng\"][\"text\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3724f8d8-be0f-44eb-8ff9-b425eee94efc", + "metadata": {}, + "outputs": [], + "source": [ + "test = mydict[\"104157S_eng\"][\"text\"]\n", + "print(test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"id": "2fb865ab-6d16-4a4c-b004-9d39fcba6812", + "metadata": {}, + "outputs": [], + "source": [ + "# flake8-noqa-cell\n", + "from googletrans import Translator" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "051ad473-8f75-40fc-ae90-7d8176cf816f", + "metadata": {}, + "outputs": [], + "source": [ + "translator = Translator()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b571d900-8829-4095-904f-dfee3ce46041", + "metadata": {}, + "outputs": [], + "source": [ + "result = translator.translate(test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1ff0b21-d2cc-4a50-8e86-a45362f1a0a1", + "metadata": {}, + "outputs": [], + "source": [ + "print(result.text)\n", + "print(result.src)\n", + "print(result.origin)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e135f00e-cdd5-4931-8649-ba0b293e8bdd", + "metadata": {}, + "outputs": [], + "source": [ + "# now clean the gibberish\n", + "# do spell check\n", + "# flake8-noqa-cell\n", + "import contextualSpellCheck\n", + "import spacy\n", + "\n", + "nlp = spacy.load(\"en_core_web_md\")\n", + "contextualSpellCheck.add_to_pipe(nlp)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01d17f6e-30ae-4cc1-ad03-11a3be6847c2", + "metadata": {}, + "outputs": [], + "source": [ + "doc = nlp(result.text)\n", + "print(doc._.outcome_spellCheck)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ef91dd2-70a8-4a04-b50f-5efba076bbb0", + "metadata": {}, + "outputs": [], + "source": [ + "final_text = []\n", + "for token in doc:\n", + " if token.pos_ != \"SPACE\":\n", + " if token.pos_ != \"NUM\":\n", + " if token.has_vector:\n", + " final_text.append(token.text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d0a3360-3fd9-4ab3-b4dc-122e90c16c7c", + "metadata": {}, + "outputs": [], + "source": [ + "print(\" \".join(final_text))" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "id": "b1abe05e-0c2b-4769-8fa5-fc67297288ea", + "metadata": {}, + "outputs": [], "source": [] } ], @@ -410,7 +293,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.4" + "version": "3.10.6" }, "vscode": { "interpreter": { diff --git a/notebooks/objects_expression.ipynb b/notebooks/objects_expression.ipynb index 2bc71db..6a92294 100644 --- a/notebooks/objects_expression.ipynb +++ b/notebooks/objects_expression.ipynb @@ -165,7 +165,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.4" + "version": "3.10.6" }, "vscode": { "interpreter": { @@ -174,5 +174,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/pyproject.toml b/pyproject.toml index 65bff80..91f7a01 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ dependencies = [ "cvlib", "deepface", "ipywidgets", - "numpy", + "numpy<=1.23.4", "opencv_python", "pandas", "pooch", @@ -40,6 +40,8 @@ dependencies = [ "matplotlib", "pytest", "opencv-contrib-python", + "googletrans==3.1.0a0", + "spacy", ] [project.scripts] diff --git a/requirements.txt b/requirements.txt index 04d073a..9bb7582 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ google-cloud-vision cvlib deepface ipywidgets -numpy +numpy<=1.23.4 opencv_python pandas pooch @@ -16,3 +16,6 @@ pytest pytest-cov matplotlib opencv-contrib-python +googletrans==3.1.0a0 +spacy +https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.1/en_core_web_md-3.4.1.tar.gz \ No newline at end of file