{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "dcaa3da1",
   "metadata": {},
   "source": [
    "# Text extraction on image\n",
    "Inga Ulusoy, SSC, July 2022\n",
    "\n",
    "This notebook runs the `ammico` text detector on a set of images, either to inspect the results per image or to analyze all images directly, and writes the results to a csv file."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f43f327c",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# if running on google colab\n",
    "# flake8-noqa-cell\n",
    "import os\n",
    "\n",
    "if \"google.colab\" in str(get_ipython()):\n",
    "    # %pip (instead of !pip) installs into the environment of the running kernel\n",
    "    # install setuptools\n",
    "    %pip install setuptools==61 -qqq\n",
    "    # install ammico\n",
    "    %pip install git+https://github.com/ssciwr/ammico.git -qqq\n",
    "    # mount google drive for data and API key\n",
    "    from google.colab import drive\n",
    "\n",
    "    drive.mount(\"/content/drive\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cf362e60",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import ammico\n",
    "from ammico import utils as mutils\n",
    "from ammico import display as mdisplay"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "27675810",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# download the models if they are not there yet\n",
    "!python -m spacy download en_core_web_md\n",
    "!python -m textblob.download_corpora"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6da3a7aa",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "images = mutils.find_files(path=\"data\", limit=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8b32409f",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "mydict = mutils.initialize_dict(images)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7b8b929f",
   "metadata": {},
   "source": [
    "## Google Cloud Vision API\n",
    "The first 1000 images per month are free."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0891b795-c7fe-454c-a45d-45fadf788142",
   "metadata": {},
   "source": [
    "## Inspect the elements per image"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c6ecc88",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "mdisplay.explore_analysis(mydict, identify=\"text-on-image\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9c3e72b5-0e57-4019-b45e-3e36a74e7f52",
   "metadata": {},
   "source": [
    "## Or directly analyze for further processing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "365c78b1-7ff4-4213-86fa-6a0a2d05198f",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# replace each entry with what TextDetector(...).analyse_image() returns for it\n",
    "for key, entry in mydict.items():\n",
    "    mydict[key] = ammico.text.TextDetector(\n",
    "        entry, analyse_text=True\n",
    "    ).analyse_image()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3c063eda",
   "metadata": {},
   "source": [
    "## Convert to dataframe and write csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5709c2cd",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "outdict = mutils.append_data_to_dict(mydict)\n",
    "df = mutils.dump_df(outdict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c4f05637",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# check the dataframe\n",
    "df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bf6c9ddb",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Write the csv\n",
    "df.to_csv(\"./data_out.csv\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "vscode": {
   "interpreter": {
    "hash": "da98320027a74839c7141b42ef24e2d47d628ba1f51115c13da5d8b45a372ec2"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}