AMMICO/notebooks/get-text-from-image.ipynb
Inga Ulusoy 5d033479ce
Set up colab (#50)
* lower python version for google colab

* faces working with colab

* text for colab

* fix dict update bug

* final edits for colab

* update readme with links

* load text models on demand

* update test

* fix typo; more description in readme

* remove optional keys
2023-01-12 12:57:14 +01:00

222 строки
4.8 KiB
Plaintext
Generated

{
"cells": [
{
"cell_type": "markdown",
"id": "dcaa3da1",
"metadata": {},
"source": [
"# Notebook for text extraction from images\n",
"Inga Ulusoy, SSC, July 2022"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f43f327c",
"metadata": {},
"outputs": [],
"source": [
"# if running on google colab\n",
"# flake8-noqa-cell\n",
"import os\n",
"\n",
"if \"google.colab\" in str(get_ipython()):\n",
"    # %pip (instead of !pip) installs into the environment of the running kernel\n",
"    # install the pinned setuptools version\n",
"    %pip install setuptools==61 -qqq\n",
"    # install misinformation\n",
"    # NOTE(review): this installs from the `set-up-colab` branch - confirm the\n",
"    # branch still exists, or point to the default branch / a tagged release\n",
"    %pip install git+https://github.com/ssciwr/misinformation.git@set-up-colab -qqq\n",
"    # mount google drive for data and API key\n",
"    from google.colab import drive\n",
"\n",
"    drive.mount(\"/content/drive\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cf362e60",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"from IPython.display import Image, display\n",
"import misinformation\n",
"\n",
"# download the models if they are not there yet\n",
"# use the kernel's own interpreter (sys.executable): a bare `python` on the\n",
"# shell PATH may belong to a different environment than this notebook's kernel\n",
"!{sys.executable} -m spacy download en_core_web_md\n",
"!{sys.executable} -m textblob.download_corpora"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6da3a7aa",
"metadata": {},
"outputs": [],
"source": [
"# look up the files to analyse in the data folder on the drive\n",
"# (limit=1000 presumably caps the number of files returned - verify in find_files)\n",
"images = misinformation.find_files(\n",
"    limit=1000,\n",
"    path=\"drive/MyDrive/misinformation-data/\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bf811ce0",
"metadata": {},
"outputs": [],
"source": [
"# show the first three files inline as a quick visual check\n",
"for image_path in images[:3]:\n",
"    display(Image(filename=image_path))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8b32409f",
"metadata": {},
"outputs": [],
"source": [
"# initialize the dictionary from the first ten files only\n",
"mydict = misinformation.utils.initialize_dict(images[:10])"
]
},
{
"cell_type": "markdown",
"id": "7b8b929f",
"metadata": {},
"source": [
"# Google Cloud Vision API\n",
"The first 1000 images per month are free."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cbf74c0b-52fe-4fb8-b617-f18611e8f986",
"metadata": {},
"outputs": [],
"source": [
"# set the credentials variable to the key file stored in the data folder\n",
"os.environ[\"GOOGLE_APPLICATION_CREDENTIALS\"] = (\n",
"    \"drive/MyDrive/misinformation-data/misinformation-campaign-981aa55a3b13.json\"\n",
")"
]
},
{
"cell_type": "markdown",
"id": "0891b795-c7fe-454c-a45d-45fadf788142",
"metadata": {},
"source": [
"## Inspect the elements per image"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7c6ecc88",
"metadata": {},
"outputs": [],
"source": [
"# explore the text-on-image analysis for the entries in mydict\n",
"misinformation.explore_analysis(\n",
"    mydict,\n",
"    identify=\"text-on-image\",\n",
"    analyse_text=True,\n",
")"
]
},
{
"cell_type": "markdown",
"id": "9c3e72b5-0e57-4019-b45e-3e36a74e7f52",
"metadata": {},
"source": [
"## Or directly analyze for further processing"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "365c78b1-7ff4-4213-86fa-6a0a2d05198f",
"metadata": {},
"outputs": [],
"source": [
"# run the text detection per entry and store the result under the same key\n",
"for key in mydict:\n",
"    print(key)\n",
"    detector = misinformation.text.TextDetector(mydict[key], analyse_text=True)\n",
"    mydict[key] = detector.analyse_image()"
]
},
{
"cell_type": "markdown",
"id": "3c063eda",
"metadata": {},
"source": [
"## Convert to dataframe and write csv"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5709c2cd",
"metadata": {},
"outputs": [],
"source": [
"# gather the results held in mydict ...\n",
"outdict = misinformation.utils.append_data_to_dict(\n",
"    mydict,\n",
")\n",
"# ... and convert them into a dataframe\n",
"df = misinformation.utils.dump_df(\n",
"    outdict,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c4f05637",
"metadata": {},
"outputs": [],
"source": [
"# show the first ten rows of the dataframe as a sanity check\n",
"df.head(n=10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bf6c9ddb",
"metadata": {},
"outputs": [],
"source": [
"# save the dataframe as csv next to the input data on the drive\n",
"df.to_csv(\n",
"    \"drive/MyDrive/misinformation-data/data_out.csv\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "568537df",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "misinf",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6 (main, Oct 24 2022, 16:07:47) [GCC 11.2.0]"
},
"vscode": {
"interpreter": {
"hash": "da98320027a74839c7141b42ef24e2d47d628ba1f51115c13da5d8b45a372ec2"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}