AMMICO/notebooks/get-text-from-image.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "dcaa3da1",
   "metadata": {},
   "source": [
    "# Notebook for text extraction on image\n",
    "Inga Ulusoy, SSC, July 2022"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cf362e60",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from IPython.display import Image, display\n",
    "import misinformation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6da3a7aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "images = misinformation.find_files(path=\"../data/images-text/\", limit=1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bf811ce0",
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in images[0:10]:\n",
    "    display(Image(filename=i))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8b32409f",
   "metadata": {},
   "outputs": [],
   "source": [
    "mydict = misinformation.utils.initialize_dict(images[0:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3be954ef-d31f-4e4d-857c-c14d5fda91f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "mydict"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7b8b929f",
   "metadata": {},
   "source": [
    "# google cloud vision API\n",
    "First 1000 images per month are free."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cbf74c0b-52fe-4fb8-b617-f18611e8f986",
   "metadata": {},
   "outputs": [],
   "source": [
    "os.environ[\n",
    "    \"GOOGLE_APPLICATION_CREDENTIALS\"\n",
    "] = \"../data/misinformation-campaign-981aa55a3b13.json\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0891b795-c7fe-454c-a45d-45fadf788142",
   "metadata": {},
   "source": [
    "## Inspect the elements per image"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c6ecc88",
   "metadata": {},
   "outputs": [],
   "source": [
    "misinformation.explore_analysis(mydict, identify=\"text-on-image\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9c3e72b5-0e57-4019-b45e-3e36a74e7f52",
   "metadata": {},
   "source": [
    "## Or directly analyze for further processing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "365c78b1-7ff4-4213-86fa-6a0a2d05198f",
   "metadata": {},
   "outputs": [],
   "source": [
    "for key in mydict:\n",
    "    print(key)\n",
    "    mydict[key] = misinformation.text.TextDetector(mydict[key]).analyse_image()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c978fdb4-1f3a-4b78-b6ff-79c6e8a6fe82",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(mydict[\"109237S_spa\"][\"text_clean\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3c063eda",
   "metadata": {},
   "source": [
    "## Convert to dataframe and write csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5709c2cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "outdict = misinformation.utils.append_data_to_dict(mydict)\n",
    "df = misinformation.utils.dump_df(outdict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c4f05637",
   "metadata": {},
   "outputs": [],
   "source": [
    "# check the dataframe\n",
    "df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bf6c9ddb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the csv\n",
    "df.to_csv(\"./data_out.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "568537df",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.0"
  },
  "vscode": {
   "interpreter": {
    "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}