зеркало из
				https://github.com/ssciwr/AMMICO.git
				synced 2025-10-31 05:56:05 +02:00 
			
		
		
		
	
		
			
				
	
	
		
			424 строки
		
	
	
		
			11 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
		
			Generated
		
	
	
			
		
		
	
	
			424 строки
		
	
	
		
			11 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
		
			Generated
		
	
	
| {
 | |
|  "cells": [
 | |
|   {
 | |
|    "cell_type": "markdown",
 | |
|    "id": "dcaa3da1",
 | |
|    "metadata": {},
 | |
|    "source": [
 | |
|     "# Notebook for text extraction on image\n",
 | |
|     "Inga Ulusoy, SSC, July 2022"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": null,
 | |
|    "id": "cf362e60",
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "import os\n",
 | |
|     "from IPython.display import Image, display\n",
 | |
|     "import misinformation"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": null,
 | |
|    "id": "6da3a7aa",
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "images = misinformation.find_files(limit=1000)"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": null,
 | |
|    "id": "bf811ce0",
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "for i in images:\n",
 | |
|     "    display(Image(filename=i))"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": null,
 | |
|    "id": "366e2060",
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "# start with only English\n",
 | |
|     "mysubfiles = [i for i in images if \"eng\" in i]"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": null,
 | |
|    "id": "b330b267",
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "for i in mysubfiles:\n",
 | |
|     "    display(Image(filename=i))"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": null,
 | |
|    "id": "8b32409f",
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "mydict = misinformation.utils.initialize_dict(mysubfiles)"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "markdown",
 | |
|    "id": "07b7a7a3",
 | |
|    "metadata": {},
 | |
|    "source": [
 | |
|     "# Pre-process the images: Convert to greyscale and increase contrast"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": null,
 | |
|    "id": "4cacfb0d",
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "import cv2\n",
 | |
|     "from matplotlib import pyplot as plt\n",
 | |
|     "import numpy as np"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": null,
 | |
|    "id": "c94b02cb-9e96-4812-8448-8bc731bfd8aa",
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "! pip install matplotlib\n",
 | |
|     "! pip install numpy"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": null,
 | |
|    "id": "8a2d3057",
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
def preprocess(filename):
    """Read an image and prepare it for OCR.

    Returns a tuple (gray, sharpened, inverted): the grayscale image,
    an unsharp-masked (contrast-enhanced) version of it, and the
    bitwise inversion of the sharpened image.
    """
    image = cv2.imread(filename)
    # Convert to grayscale; OCR engines work on single-channel input.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Sharpen contrast by subtracting a smoothed version (unsharp masking).
    sharpened = unsharp_mask(gray, amount=1.1, threshold=0.1)
    inverted = invert_image(sharpened)
    return gray, sharpened, inverted


# Unsharp mask algorithm, cf.
# https://docs.opencv.org/4.x/d1/d10/classcv_1_1MatExpr.html#details
def unsharp_mask(image, kernel_size=(5, 5), sigma=1.0, amount=1.0, threshold=0):
    """Return a sharpened version of the image, using an unsharp mask.

    amount: 0 is neutral (no sharpening); higher values give sharper images.
    threshold: pixels whose |original - blurred| difference is below this
        value are copied back unchanged, keeping low-contrast (noise)
        regions stable.
    """
    blurred = cv2.GaussianBlur(image, kernel_size, sigma)
    sharpened = float(amount + 1) * image - float(amount) * blurred
    # Clip to the valid 8-bit range before converting back to uint8.
    sharpened = np.clip(sharpened, 0, 255).round().astype(np.uint8)
    if threshold > 0:
        # Cast to a signed type first: subtracting uint8 arrays wraps around
        # (e.g. 10 - 20 == 246), which would corrupt the low-contrast mask.
        diff = np.absolute(image.astype(np.int16) - blurred.astype(np.int16))
        low_contrast_mask = diff < threshold
        np.copyto(sharpened, image, where=low_contrast_mask)
    return sharpened


def invert_image(image):
    """Return the bitwise inversion (negative) of the image."""
    return cv2.bitwise_not(image)
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": null,
 | |
|    "id": "a483868b",
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
# Preprocess every selected image; each entry is the
# (gray, sharpened, inverted) tuple returned by preprocess().
grey_image = [preprocess(filename) for filename in mysubfiles]
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": null,
 | |
|    "id": "08ed750d",
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
for gray, sharpened, _ in grey_image:
    # Show grayscale and sharpened variants side by side. Calling imshow
    # twice on the same axes (as before) only ever displayed the second
    # image -- the first draw was overwritten before plt.show().
    fig, axes = plt.subplots(1, 2)
    # Pass an explicit gray colormap with fixed limits so matplotlib does
    # not apply its default (viridis) colormap to single-channel data.
    axes[0].imshow(gray, cmap="gray", vmin=0, vmax=255)
    axes[0].set_title("grayscale")
    axes[1].imshow(sharpened, cmap="gray", vmin=0, vmax=255)
    axes[1].set_title("sharpened")
    plt.show()
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "markdown",
 | |
|    "id": "71ac2229",
 | |
|    "metadata": {},
 | |
|    "source": [
 | |
|     "Maybe further preprocessing by cropping out text regions?"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "markdown",
 | |
|    "id": "7786d09c",
 | |
|    "metadata": {},
 | |
|    "source": [
 | |
|     "# Try out different libraries\n",
 | |
|     "## The standard go-to tool that is slightly complicated: pytesseract\n",
 | |
|     "Install tesseract and the language libraries:\n",
 | |
|     "```\n",
 | |
|     "sudo apt install tesseract-ocr  \n",
 | |
|     "sudo apt install tesseract-ocr-all  \n",
 | |
|     "sudo apt install imagemagick  \n",
 | |
|     "```  "
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": null,
 | |
|    "id": "0d69504c",
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "from pytesseract import pytesseract\n",
 | |
|     "\n",
 | |
|     "pytesseract.tesseract_cmd = r\"tesseract\""
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": null,
 | |
|    "id": "0b27c98c-b437-4c8b-8844-96d8718eea49",
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "! pip install pytesseract"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": null,
 | |
|    "id": "529de8d8",
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
myimage = grey_image[1]

# Show the three preprocessing variants (gray, sharpened, inverted) ...
# (loops replace the previous three copy-pasted show/OCR statement groups;
# output order is unchanged: all images first, then all extracted texts)
for variant in myimage:
    plt.imshow(variant, cmap="gray", vmin=0, vmax=255)
    plt.show()

# ... then OCR each variant to compare which preprocessing extracts best.
for variant in myimage:
    print(pytesseract.image_to_string(variant))
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "markdown",
 | |
|    "id": "e25dd39e",
 | |
|    "metadata": {},
 | |
|    "source": [
 | |
|     "Here we probably would need to compare extraction results from different preprocessed images and overlay them in a final text."
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": null,
 | |
|    "id": "36495f3f",
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
for image_path in mysubfiles:
    # Load the image and convert to grayscale.
    img = cv2.imread(image_path)
    gray_image = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Binarize; with THRESH_OTSU the threshold is chosen automatically
    # (the 130 is only the nominal value and is ignored by Otsu's method).
    binary_image = cv2.threshold(
        gray_image, 130, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )[1]

    # Invert so the text is white on a black background.
    inverted_bin = cv2.bitwise_not(binary_image)

    # Noise reduction: erosion followed by dilation with the same kernel
    # is, by definition, morphological opening -- removes small speckles.
    kernel = np.ones((2, 2), np.uint8)
    processed_img = cv2.morphologyEx(inverted_bin, cv2.MORPH_OPEN, kernel)

    # OCR the cleaned-up image and show it next to the extracted text.
    text = pytesseract.image_to_string(processed_img)
    plt.imshow(processed_img, cmap="gray", vmin=0, vmax=255)
    plt.show()
    print(text)
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "markdown",
 | |
|    "id": "d6532019",
 | |
|    "metadata": {},
 | |
|    "source": [
 | |
|     "## keras-ocr\n",
 | |
|     "Not sure how to create an image object without a url.\n",
 | |
|     "https://keras-ocr.readthedocs.io/en/latest/examples/using_pretrained_models.html"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": null,
 | |
|    "id": "44e38871",
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "import keras_ocr\n",
 | |
|     "\n",
 | |
|     "pipeline = keras_ocr.pipeline.Pipeline()"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": null,
 | |
|    "id": "2bb55068-ddd4-4b90-ae94-90181980d3c0",
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "! pip install keras-ocr"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": null,
 | |
|    "id": "0002f2c4",
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "images = [\n",
 | |
|     "    keras_ocr.tools.read(url)\n",
 | |
|     "    for url in [\n",
 | |
|     "        \"https://storage.googleapis.com/gcptutorials.com/examples/keras-ocr-img-1.jpg\",\n",
 | |
|     "        \"https://storage.googleapis.com/gcptutorials.com/examples/keras-ocr-img-2.png\",\n",
 | |
|     "    ]\n",
 | |
|     "]"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": null,
 | |
|    "id": "1567dc85",
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "prediction_groups = pipeline.recognize(images)"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": null,
 | |
|    "id": "fb1ca152",
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "predicted_image_1 = prediction_groups[0]\n",
 | |
|     "for text, box in predicted_image_1:\n",
 | |
|     "    print(text)"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "markdown",
 | |
|    "id": "7b8b929f",
 | |
|    "metadata": {},
 | |
|    "source": [
 | |
|     "## google cloud vision API\n",
 | |
|     "First 1000 images per month are free."
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": null,
 | |
|    "id": "7c6ecc88",
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
# NOTE(review): hardcoded absolute path to a service-account key -- this
# only works on one machine and risks leaking credentials if the key file
# is committed. Prefer exporting GOOGLE_APPLICATION_CREDENTIALS in the
# environment; setdefault keeps an externally supplied value intact.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/home/inga/projects/misinformation-project/misinformation-notes/seismic-bonfire-329406-412821a70264.json",
)
misinformation.explore_analysis(mydict, identify="text-on-image")
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "markdown",
 | |
|    "id": "d54407ad",
 | |
|    "metadata": {},
 | |
|    "source": [
 | |
|     "## MS Azure\n",
 | |
|     "https://docs.microsoft.com/en-us/azure/search/cognitive-search-concept-image-scenarios"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": null,
 | |
|    "id": "365c78b1-7ff4-4213-86fa-6a0a2d05198f",
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": []
 | |
|   }
 | |
|  ],
 | |
|  "metadata": {
 | |
|   "kernelspec": {
 | |
|    "display_name": "Python 3 (ipykernel)",
 | |
|    "language": "python",
 | |
|    "name": "python3"
 | |
|   },
 | |
|   "language_info": {
 | |
|    "codemirror_mode": {
 | |
|     "name": "ipython",
 | |
|     "version": 3
 | |
|    },
 | |
|    "file_extension": ".py",
 | |
|    "mimetype": "text/x-python",
 | |
|    "name": "python",
 | |
|    "nbconvert_exporter": "python",
 | |
|    "pygments_lexer": "ipython3",
 | |
|    "version": "3.10.4"
 | |
|   },
 | |
|   "vscode": {
 | |
|    "interpreter": {
 | |
|     "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a"
 | |
|    }
 | |
|   }
 | |
|  },
 | |
|  "nbformat": 4,
 | |
|  "nbformat_minor": 5
 | |
| }
 | 
