зеркало из
https://github.com/ssciwr/AMMICO.git
synced 2025-11-02 15:06:07 +02:00
* start with translate * translate and clean - notebook * spacy model in requirements * translate in module * clean in module * upload coverage only for ubuntu * update ubuntu version on runner * update dependencies * start tests for text * skip gcv test * fix age * more text tests * more text tests * add comment * test translation * fix numpy version; add reference data for trans * use utf-8 for windows
307 строки
6.4 KiB
Plaintext
Generated
307 строки
6.4 KiB
Plaintext
Generated
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "dcaa3da1",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Notebook for text extraction on image\n",
|
|
"Inga Ulusoy, SSC, July 2022"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "cf362e60",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import os\n",
|
|
"from IPython.display import Image, display\n",
|
|
"import misinformation"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "6da3a7aa",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"images = misinformation.find_files(path=\"../data/images-little-text/\", limit=1000)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "bf811ce0",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"for i in images[0:10]:\n",
|
|
" display(Image(filename=i))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "366e2060",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# start with only English\n",
|
|
"mysubfiles = [i for i in images if \"eng\" in i]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "b330b267",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"for i in mysubfiles[0:10]:\n",
|
|
" display(Image(filename=i))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "8b32409f",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"mydict = misinformation.utils.initialize_dict(images[0:10])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "3be954ef-d31f-4e4d-857c-c14d5fda91f1",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"mydict"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "7b8b929f",
|
|
"metadata": {},
|
|
"source": [
|
|
"# google cloud vision API\n",
|
|
"First 1000 images per month are free."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "cbf74c0b-52fe-4fb8-b617-f18611e8f986",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"os.environ[\n",
|
|
" \"GOOGLE_APPLICATION_CREDENTIALS\"\n",
|
|
"] = \"../data/seismic-bonfire-329406-412821a70264.json\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "0891b795-c7fe-454c-a45d-45fadf788142",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Inspect the elements per image"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "7c6ecc88",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"misinformation.explore_analysis(mydict, identify=\"text-on-image\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "9c3e72b5-0e57-4019-b45e-3e36a74e7f52",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Or directly analyze for further processing"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "365c78b1-7ff4-4213-86fa-6a0a2d05198f",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"for key in mydict:\n",
|
|
" print(key)\n",
|
|
" mydict[key] = misinformation.text.TextDetector(mydict[key]).analyse_image()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "c75a2fff-9d59-4634-8d28-e90a909caa23",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "c978fdb4-1f3a-4b78-b6ff-79c6e8a6fe82",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"print(mydict[\"104157S_eng\"][\"text\"])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "3724f8d8-be0f-44eb-8ff9-b425eee94efc",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"test = mydict[\"104157S_eng\"][\"text\"][0]\n",
|
|
"print(test)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "2fb865ab-6d16-4a4c-b004-9d39fcba6812",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# flake8-noqa-cell\n",
|
|
"from googletrans import Translator"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "051ad473-8f75-40fc-ae90-7d8176cf816f",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"translator = Translator()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "b571d900-8829-4095-904f-dfee3ce46041",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"result = translator.translate(test)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "e1ff0b21-d2cc-4a50-8e86-a45362f1a0a1",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"print(result.text)\n",
|
|
"print(result.src)\n",
|
|
"print(result.origin)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "e135f00e-cdd5-4931-8649-ba0b293e8bdd",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# now clean the gibberish\n",
|
|
"# do spell check\n",
|
|
"# flake8-noqa-cell\n",
|
|
"import contextualSpellCheck\n",
|
|
"import spacy\n",
|
|
"\n",
|
|
"nlp = spacy.load(\"en_core_web_md\")\n",
|
|
"contextualSpellCheck.add_to_pipe(nlp)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "01d17f6e-30ae-4cc1-ad03-11a3be6847c2",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"doc = nlp(result.text)\n",
|
|
"print(doc._.outcome_spellCheck)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "0ef91dd2-70a8-4a04-b50f-5efba076bbb0",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"final_text = []\n",
|
|
"for token in doc:\n",
|
|
" if token.pos_ != \"SPACE\":\n",
|
|
" if token.pos_ != \"NUM\":\n",
|
|
" if token.has_vector:\n",
|
|
" final_text.append(token.text)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "4d0a3360-3fd9-4ab3-b4dc-122e90c16c7c",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"print(\" \".join(final_text))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "b1abe05e-0c2b-4769-8fa5-fc67297288ea",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.6"
|
|
},
|
|
"vscode": {
|
|
"interpreter": {
|
|
"hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a"
|
|
}
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|