{
"cells": [
{
"cell_type": "markdown",
"id": "dcaa3da1",
"metadata": {},
"source": [
"# Notebook for text extraction on image\n",
"Inga Ulusoy, SSC, July 2022"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cf362e60",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from IPython.display import Image, display\n",
"import misinformation"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6da3a7aa",
"metadata": {},
"outputs": [],
"source": [
"images = misinformation.find_files(limit=1000)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bf811ce0",
"metadata": {},
"outputs": [],
"source": [
"for i in images:\n",
" display(Image(filename=i))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "366e2060",
"metadata": {},
"outputs": [],
"source": [
"# start with only English\n",
"mysubfiles = [i for i in images if \"eng\" in i]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b330b267",
"metadata": {},
"outputs": [],
"source": [
"for i in mysubfiles:\n",
" display(Image(filename=i))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8b32409f",
"metadata": {},
"outputs": [],
"source": [
"mydict = misinformation.utils.initialize_dict(mysubfiles)"
]
},
{
"cell_type": "markdown",
"id": "07b7a7a3",
"metadata": {},
"source": [
"# Pre-process the images: Convert to greyscale and increase contrast"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4cacfb0d",
"metadata": {},
"outputs": [],
"source": [
"import cv2\n",
"from matplotlib import pyplot as plt\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c94b02cb-9e96-4812-8448-8bc731bfd8aa",
"metadata": {},
"outputs": [],
"source": [
"! pip install matplotlib\n",
"! pip install numpy"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8a2d3057",
"metadata": {},
"outputs": [],
"source": [
"def preprocess(filename):\n",
"    \"\"\"Convert an image file into greyscale, sharpened and inverted versions.\n",
"\n",
"    Returns a tuple (gray, sharpened, inverted) of uint8 arrays, giving the\n",
"    OCR step several variants of the same image to choose from.\"\"\"\n",
"    image = cv2.imread(filename)\n",
"    # colour carries no information for text extraction - work in greyscale\n",
"    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)\n",
"    # sharpen contrast by smoothing and then subtracting the smoothed version\n",
"    sharpened = unsharp_mask(gray, amount=1.1, threshold=0.1)\n",
"    inverted = invert_image(sharpened)\n",
"    return gray, sharpened, inverted\n",
"\n",
"\n",
"# unsharp mask algorithm as used with opencv\n",
"# https://docs.opencv.org/4.x/d1/d10/classcv_1_1MatExpr.html#details\n",
"def unsharp_mask(image, kernel_size=(5, 5), sigma=1.0, amount=1.0, threshold=0):\n",
"    \"\"\"Return a sharpened version of the image, using an unsharp mask.\n",
"\n",
"    amount: 0 is neutral (returns the input unchanged); higher values give\n",
"    sharper images. threshold: pixels where the difference between the blurred\n",
"    and the original image is below this value keep their original value,\n",
"    protecting low-contrast regions from noise amplification.\"\"\"\n",
"    blurred = cv2.GaussianBlur(image, kernel_size, sigma)\n",
"    # the float scalars promote the uint8 arrays to float64, so no overflow here\n",
"    sharpened = float(amount + 1) * image - float(amount) * blurred\n",
"    # clamp back into the valid pixel range and restore the uint8 dtype\n",
"    sharpened = np.clip(sharpened, 0, 255).round().astype(np.uint8)\n",
"    if threshold > 0:\n",
"        # cv2.absdiff avoids the uint8 wrap-around that plain subtraction\n",
"        # of two uint8 arrays would cause before taking the absolute value\n",
"        low_contrast_mask = cv2.absdiff(image, blurred) < threshold\n",
"        np.copyto(sharpened, image, where=low_contrast_mask)\n",
"    return sharpened\n",
"\n",
"\n",
"def invert_image(image):\n",
"    \"\"\"Return the photometric negative (bitwise NOT) of the image.\"\"\"\n",
"    return cv2.bitwise_not(image)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a483868b",
"metadata": {},
"outputs": [],
"source": [
"grey_image = []\n",
"for i in mysubfiles:\n",
" grey_image.append(preprocess(i))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "08ed750d",
"metadata": {},
"outputs": [],
"source": [
"for image in grey_image:\n",
" # disable default colormap in imshow\n",
" plt.imshow(image[0], cmap=\"gray\", vmin=0, vmax=255)\n",
" plt.imshow(image[1], cmap=\"gray\", vmin=0, vmax=255)\n",
" plt.show()"
]
},
{
"cell_type": "markdown",
"id": "71ac2229",
"metadata": {},
"source": [
"Maybe further preprocessing could crop out the text regions?"
]
},
{
"cell_type": "markdown",
"id": "7786d09c",
"metadata": {},
"source": [
"# Try out different libraries\n",
"## The standard go-to tool that is slightly complicated: pytesseract\n",
"Install tesseract and the language libraries:\n",
"```\n",
"sudo apt install tesseract-ocr \n",
"sudo apt install tesseract-ocr-all \n",
"sudo apt install imagemagick \n",
"``` "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0d69504c",
"metadata": {},
"outputs": [],
"source": [
"from pytesseract import pytesseract\n",
"\n",
"pytesseract.tesseract_cmd = r\"tesseract\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0b27c98c-b437-4c8b-8844-96d8718eea49",
"metadata": {},
"outputs": [],
"source": [
"! pip install pytesseract"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "529de8d8",
"metadata": {},
"outputs": [],
"source": [
"myimage = grey_image[1]\n",
"plt.imshow(myimage[0], cmap=\"gray\", vmin=0, vmax=255)\n",
"plt.show()\n",
"\n",
"plt.imshow(myimage[1], cmap=\"gray\", vmin=0, vmax=255)\n",
"plt.show()\n",
"\n",
"plt.imshow(myimage[2], cmap=\"gray\", vmin=0, vmax=255)\n",
"plt.show()\n",
"\n",
"text = pytesseract.image_to_string(myimage[0])\n",
"print(text)\n",
"text = pytesseract.image_to_string(myimage[1])\n",
"print(text)\n",
"text = pytesseract.image_to_string(myimage[2])\n",
"print(text)"
]
},
{
"cell_type": "markdown",
"id": "e25dd39e",
"metadata": {},
"source": [
"Here we would probably need to compare the extraction results from different preprocessed images and merge them into a final text."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "36495f3f",
"metadata": {},
"outputs": [],
"source": [
"for image in mysubfiles:\n",
" # Loading image using OpenCV\n",
" img = cv2.imread(image)\n",
"\n",
" # Preprocessing image\n",
" # Converting to grayscale\n",
" gray_image = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n",
"\n",
" # creating Binary image by selecting proper threshold\n",
" binary_image = cv2.threshold(\n",
" gray_image, 130, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU\n",
" )[1]\n",
"\n",
" # Inverting the image\n",
" inverted_bin = cv2.bitwise_not(binary_image)\n",
"\n",
" # Some noise reduction\n",
" kernel = np.ones((2, 2), np.uint8)\n",
" processed_img = cv2.erode(inverted_bin, kernel, iterations=1)\n",
" processed_img = cv2.dilate(processed_img, kernel, iterations=1)\n",
"\n",
" # Applying image_to_string method\n",
" text = pytesseract.image_to_string(processed_img)\n",
" plt.imshow(processed_img, cmap=\"gray\", vmin=0, vmax=255)\n",
" plt.show()\n",
" print(text)"
]
},
{
"cell_type": "markdown",
"id": "d6532019",
"metadata": {},
"source": [
"## keras-ocr\n",
"Not sure how to create an image object without a url.\n",
"https://keras-ocr.readthedocs.io/en/latest/examples/using_pretrained_models.html"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "44e38871",
"metadata": {},
"outputs": [],
"source": [
"import keras_ocr\n",
"\n",
"pipeline = keras_ocr.pipeline.Pipeline()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2bb55068-ddd4-4b90-ae94-90181980d3c0",
"metadata": {},
"outputs": [],
"source": [
"! pip install keras-ocr"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0002f2c4",
"metadata": {},
"outputs": [],
"source": [
"images = [\n",
" keras_ocr.tools.read(url)\n",
" for url in [\n",
" \"https://storage.googleapis.com/gcptutorials.com/examples/keras-ocr-img-1.jpg\",\n",
" \"https://storage.googleapis.com/gcptutorials.com/examples/keras-ocr-img-2.png\",\n",
" ]\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1567dc85",
"metadata": {},
"outputs": [],
"source": [
"prediction_groups = pipeline.recognize(images)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fb1ca152",
"metadata": {},
"outputs": [],
"source": [
"predicted_image_1 = prediction_groups[0]\n",
"for text, box in predicted_image_1:\n",
" print(text)"
]
},
{
"cell_type": "markdown",
"id": "7b8b929f",
"metadata": {},
"source": [
"## google cloud vision API\n",
"First 1000 images per month are free."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7c6ecc88",
"metadata": {},
"outputs": [],
"source": [
"os.environ[\n",
" \"GOOGLE_APPLICATION_CREDENTIALS\"\n",
"] = \"/home/inga/projects/misinformation-project/misinformation-notes/seismic-bonfire-329406-412821a70264.json\"\n",
"misinformation.explore_analysis(mydict, identify=\"text-on-image\")"
]
},
{
"cell_type": "markdown",
"id": "d54407ad",
"metadata": {},
"source": [
"## MS Azure\n",
"https://docs.microsoft.com/en-us/azure/search/cognitive-search-concept-image-scenarios"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "365c78b1-7ff4-4213-86fa-6a0a2d05198f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
},
"vscode": {
"interpreter": {
"hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}