AMMICO/notebooks/get-text-from-image.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "dcaa3da1",
   "metadata": {},
   "source": [
    "# Notebook for text extraction from images\n",
    "Inga Ulusoy, SSC, July 2022"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cf362e60",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from IPython.display import Image, display\n",
    "import misinformation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6da3a7aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Look up the sample images (limit=1000 presumably caps the number of results).\n",
    "images = misinformation.find_files(\n",
    "    path=\"../data/images-little-text/\",\n",
    "    limit=1000,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bf811ce0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview the first ten entries of the full image list.\n",
    "for image_path in images[:10]:\n",
    "    display(Image(filename=image_path))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "366e2060",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Start with English only: keep the entries whose path contains \"eng\".\n",
    "mysubfiles = [image_path for image_path in images if \"eng\" in image_path]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b330b267",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview the first ten entries of the English-only subset.\n",
    "for image_path in mysubfiles[:10]:\n",
    "    display(Image(filename=image_path))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8b32409f",
   "metadata": {},
   "outputs": [],
   "source": [
    "mydict = misinformation.utils.initialize_dict(images[0:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3be954ef-d31f-4e4d-857c-c14d5fda91f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "mydict"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7b8b929f",
   "metadata": {},
   "source": [
    "# Google Cloud Vision API\n",
    "The first 1000 images per month are free."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cbf74c0b-52fe-4fb8-b617-f18611e8f986",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Point the Google client library at the credentials JSON file, but only\n",
    "# when no credentials path is configured yet, so that a user's own\n",
    "# GOOGLE_APPLICATION_CREDENTIALS setting is not overwritten.\n",
    "if \"GOOGLE_APPLICATION_CREDENTIALS\" not in os.environ:\n",
    "    os.environ[\n",
    "        \"GOOGLE_APPLICATION_CREDENTIALS\"\n",
    "    ] = \"../data/seismic-bonfire-329406-412821a70264.json\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0891b795-c7fe-454c-a45d-45fadf788142",
   "metadata": {},
   "source": [
    "## Inspect the elements per image"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c6ecc88",
   "metadata": {},
   "outputs": [],
   "source": [
    "misinformation.explore_analysis(mydict, identify=\"text-on-image\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9c3e72b5-0e57-4019-b45e-3e36a74e7f52",
   "metadata": {},
   "source": [
    "## Or directly analyze for further processing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "365c78b1-7ff4-4213-86fa-6a0a2d05198f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the text detector on every entry and store the value returned by\n",
    "# analyse_image() back under the same key.\n",
    "for key, entry in mydict.items():\n",
    "    print(key)\n",
    "    mydict[key] = misinformation.text.TextDetector(entry).analyse_image()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c75a2fff-9d59-4634-8d28-e90a909caa23",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c978fdb4-1f3a-4b78-b6ff-79c6e8a6fe82",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(mydict[\"104157S_eng\"][\"text\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3724f8d8-be0f-44eb-8ff9-b425eee94efc",
   "metadata": {},
   "outputs": [],
   "source": [
    "test = mydict[\"104157S_eng\"][\"text\"][0]\n",
    "print(test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2fb865ab-6d16-4a4c-b004-9d39fcba6812",
   "metadata": {},
   "outputs": [],
   "source": [
    "# flake8-noqa-cell\n",
    "from googletrans import Translator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "051ad473-8f75-40fc-ae90-7d8176cf816f",
   "metadata": {},
   "outputs": [],
   "source": [
    "translator = Translator()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b571d900-8829-4095-904f-dfee3ce46041",
   "metadata": {},
   "outputs": [],
   "source": [
    "result = translator.translate(test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e1ff0b21-d2cc-4a50-8e86-a45362f1a0a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the translation result: its text, src and origin fields, one per line.\n",
    "print(result.text, result.src, result.origin, sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e135f00e-cdd5-4931-8649-ba0b293e8bdd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Now clean up the noisy text in the translated result:\n",
    "# run a contextual spell check with spaCy\n",
    "# flake8-noqa-cell\n",
    "import contextualSpellCheck\n",
    "import spacy\n",
    "\n",
    "nlp = spacy.load(\"en_core_web_md\")\n",
    "contextualSpellCheck.add_to_pipe(nlp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "01d17f6e-30ae-4cc1-ad03-11a3be6847c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "doc = nlp(result.text)\n",
    "print(doc._.outcome_spellCheck)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0ef91dd2-70a8-4a04-b50f-5efba076bbb0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the text of every token that is not tagged SPACE or NUM and that\n",
    "# has a word vector.\n",
    "final_text = [\n",
    "    token.text\n",
    "    for token in doc\n",
    "    if token.pos_ not in (\"SPACE\", \"NUM\") and token.has_vector\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4d0a3360-3fd9-4ab3-b4dc-122e90c16c7c",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\" \".join(final_text))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b1abe05e-0c2b-4769-8fa5-fc67297288ea",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  },
  "vscode": {
   "interpreter": {
    "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}