{ "cells": [ { "cell_type": "markdown", "id": "dcaa3da1", "metadata": {}, "source": [ "# Notebook for text extraction on image\n", "Inga Ulusoy, SSC, July 2022" ] }, { "cell_type": "code", "execution_count": null, "id": "cf362e60", "metadata": {}, "outputs": [], "source": [ "import os\n", "from IPython.display import Image, display\n", "import misinformation" ] }, { "cell_type": "code", "execution_count": null, "id": "6da3a7aa", "metadata": {}, "outputs": [], "source": [ "images = misinformation.find_files(limit=1000)" ] }, { "cell_type": "code", "execution_count": null, "id": "bf811ce0", "metadata": {}, "outputs": [], "source": [ "for i in images:\n", " display(Image(filename=i))" ] }, { "cell_type": "code", "execution_count": null, "id": "366e2060", "metadata": {}, "outputs": [], "source": [ "# start with only English\n", "mysubfiles = [i for i in images if \"eng\" in i]" ] }, { "cell_type": "code", "execution_count": null, "id": "b330b267", "metadata": {}, "outputs": [], "source": [ "for i in mysubfiles:\n", " display(Image(filename=i))" ] }, { "cell_type": "markdown", "id": "07b7a7a3", "metadata": {}, "source": [ "# Pre-process the images: Convert to greyscale and increase contrast" ] }, { "cell_type": "code", "execution_count": null, "id": "4cacfb0d", "metadata": {}, "outputs": [], "source": [ "import cv2\n", "from matplotlib import pyplot as plt\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": null, "id": "c94b02cb-9e96-4812-8448-8bc731bfd8aa", "metadata": {}, "outputs": [], "source": [ "! pip install matplotlib\n", "! pip install numpy" ] }, { "cell_type": "code", "execution_count": null, "id": "8a2d3057", "metadata": {}, "outputs": [], "source": [ "def preprocess(filename):\n", " \"\"\"Preprocess the image to enhance features for extraction.\"\"\"\n", " image = cv2.imread(filename)\n", " # preserve the original image\n", " original = image.copy()\n", " # Grayscale, Gaussian blur, Otsu's threshold\n", " gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)\n", " # sharpen contrast by first smoothing and then substracting the smoothed and thresholded version\n", " sharpened = unsharp_mask(gray, amount=1.1, threshold=0.1)\n", " inverted = invert_image(sharpened)\n", " return gray, sharpened, inverted\n", "\n", "\n", "# use unsharp mask algorithm from opencv\n", "# https://docs.opencv.org/4.x/d1/d10/classcv_1_1MatExpr.html#details\n", "def unsharp_mask(image, kernel_size=(5, 5), sigma=1.0, amount=1.0, threshold=0):\n", " \"\"\"Return a sharpened version of the image, using an unsharp mask.\n", " Amount: 1 is neutral, higher values result in shaprer images. threshold is the value below which the difference between blurred and original image gets discarded.\"\"\"\n", " blurred = cv2.GaussianBlur(image, kernel_size, sigma)\n", " sharpened = float(amount + 1) * image - float(amount) * blurred\n", " sharpened = np.maximum(sharpened, np.zeros(sharpened.shape))\n", " sharpened = np.minimum(sharpened, 255 * np.ones(sharpened.shape))\n", " sharpened = sharpened.round().astype(np.uint8)\n", " if threshold > 0:\n", " low_contrast_mask = np.absolute(image - blurred) < threshold\n", " np.copyto(sharpened, image, where=low_contrast_mask)\n", " return sharpened\n", "\n", "\n", "def invert_image(image):\n", " return cv2.bitwise_not(image)" ] }, { "cell_type": "code", "execution_count": null, "id": "a483868b", "metadata": {}, "outputs": [], "source": [ "grey_image = []\n", "for i in mysubfiles:\n", " grey_image.append(preprocess(i))" ] }, { "cell_type": "code", "execution_count": null, "id": "08ed750d", "metadata": {}, "outputs": [], "source": [ "for image in grey_image:\n", " # disable default colormap in imshow\n", " plt.imshow(image[0], cmap=\"gray\", vmin=0, vmax=255)\n", " plt.imshow(image[1], cmap=\"gray\", vmin=0, vmax=255)\n", " plt.show()" ] }, { "cell_type": "markdown", "id": "71ac2229", "metadata": {}, "source": [ "mabe further preprocess in cropping out text regions..?" ] }, { "cell_type": "markdown", "id": "7786d09c", "metadata": {}, "source": [ "# Try out different libraries\n", "## The standard go-to tool that is slightly complicated: pytesseract\n", "Install tesseract and the language libraries:\n", "```\n", "sudo apt install tesseract-ocr \n", "sudo apt install tesseract-ocr-all \n", "sudo apt install imagemagick \n", "``` " ] }, { "cell_type": "code", "execution_count": null, "id": "0d69504c", "metadata": {}, "outputs": [], "source": [ "from pytesseract import pytesseract\n", "\n", "pytesseract.tesseract_cmd = r\"tesseract\"" ] }, { "cell_type": "code", "execution_count": null, "id": "0b27c98c-b437-4c8b-8844-96d8718eea49", "metadata": {}, "outputs": [], "source": [ "! pip install pytesseract" ] }, { "cell_type": "code", "execution_count": null, "id": "529de8d8", "metadata": {}, "outputs": [], "source": [ "myimage = grey_image[1]\n", "plt.imshow(myimage[0], cmap=\"gray\", vmin=0, vmax=255)\n", "plt.show()\n", "\n", "plt.imshow(myimage[1], cmap=\"gray\", vmin=0, vmax=255)\n", "plt.show()\n", "\n", "plt.imshow(myimage[2], cmap=\"gray\", vmin=0, vmax=255)\n", "plt.show()\n", "\n", "text = pytesseract.image_to_string(myimage[0])\n", "print(text)\n", "text = pytesseract.image_to_string(myimage[1])\n", "print(text)\n", "text = pytesseract.image_to_string(myimage[2])\n", "print(text)" ] }, { "cell_type": "markdown", "id": "e25dd39e", "metadata": {}, "source": [ "Here we probably would need to compare extractopm from different preprocessed images and overlay in a final text." ] }, { "cell_type": "code", "execution_count": null, "id": "36495f3f", "metadata": {}, "outputs": [], "source": [ "for image in mysubfiles:\n", " # Loading image using OpenCV\n", " img = cv2.imread(image)\n", "\n", " # Preprocessing image\n", " # Converting to grayscale\n", " gray_image = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n", "\n", " # creating Binary image by selecting proper threshold\n", " binary_image = cv2.threshold(\n", " gray_image, 130, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU\n", " )[1]\n", "\n", " # Inverting the image\n", " inverted_bin = cv2.bitwise_not(binary_image)\n", "\n", " # Some noise reduction\n", " kernel = np.ones((2, 2), np.uint8)\n", " processed_img = cv2.erode(inverted_bin, kernel, iterations=1)\n", " processed_img = cv2.dilate(processed_img, kernel, iterations=1)\n", "\n", " # Applying image_to_string method\n", " text = pytesseract.image_to_string(processed_img)\n", " plt.imshow(processed_img, cmap=\"gray\", vmin=0, vmax=255)\n", " plt.show()\n", " print(text)" ] }, { "cell_type": "markdown", "id": "d6532019", "metadata": {}, "source": [ "## keras-ocr\n", "Not sure how to create an image object without a url.\n", "https://keras-ocr.readthedocs.io/en/latest/examples/using_pretrained_models.html" ] }, { "cell_type": "code", "execution_count": null, "id": "44e38871", "metadata": {}, "outputs": [], "source": [ "import keras_ocr\n", "\n", "pipeline = keras_ocr.pipeline.Pipeline()" ] }, { "cell_type": "code", "execution_count": null, "id": "2bb55068-ddd4-4b90-ae94-90181980d3c0", "metadata": {}, "outputs": [], "source": [ "! pip install keras-ocr" ] }, { "cell_type": "code", "execution_count": null, "id": "0002f2c4", "metadata": {}, "outputs": [], "source": [ "images = [\n", " keras_ocr.tools.read(url)\n", " for url in [\n", " \"https://storage.googleapis.com/gcptutorials.com/examples/keras-ocr-img-1.jpg\",\n", " \"https://storage.googleapis.com/gcptutorials.com/examples/keras-ocr-img-2.png\",\n", " ]\n", "]" ] }, { "cell_type": "code", "execution_count": null, "id": "1567dc85", "metadata": {}, "outputs": [], "source": [ "prediction_groups = pipeline.recognize(images)" ] }, { "cell_type": "code", "execution_count": null, "id": "fb1ca152", "metadata": {}, "outputs": [], "source": [ "predicted_image_1 = prediction_groups[0]\n", "for text, box in predicted_image_1:\n", " print(text)" ] }, { "cell_type": "markdown", "id": "7b8b929f", "metadata": {}, "source": [ "## google cloud vision API\n", "First 1000 images per month are free." ] }, { "cell_type": "code", "execution_count": null, "id": "7c6ecc88", "metadata": {}, "outputs": [], "source": [ "os.environ[\n", " \"GOOGLE_APPLICATION_CREDENTIALS\"\n", "] = \"/home/inga/projects/misinformation-project/misinformation-notes/seismic-bonfire-329406-412821a70264.json\"\n", "images = mysubfiles[1:5]\n", "misinformation.explore_analysis(images, identify=\"text-on-image\")" ] }, { "cell_type": "markdown", "id": "d54407ad", "metadata": {}, "source": [ "## MS Azure\n", "https://docs.microsoft.com/en-us/azure/search/cognitive-search-concept-image-scenarios" ] }, { "cell_type": "code", "execution_count": null, "id": "365c78b1-7ff4-4213-86fa-6a0a2d05198f", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.4" }, "vscode": { "interpreter": { "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" } } }, "nbformat": 4, "nbformat_minor": 5 }