{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "dcaa3da1",
   "metadata": {},
   "source": [
    "# Notebook for text extraction from images\n",
    "Inga Ulusoy, SSC, July 2022\n",
    "\n",
    "This notebook extracts the text found on images, translates it with `googletrans`, and cleans the result with a spaCy-based spell check."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cf362e60",
   "metadata": {},
   "outputs": [],
   "source": [
    "# All imports live here so that a missing dependency fails on the first cell.\n",
    "import os\n",
    "\n",
    "import contextualSpellCheck\n",
    "import spacy\n",
    "from googletrans import Translator\n",
    "from IPython.display import Image, display\n",
    "\n",
    "import misinformation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6da3a7aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "images = misinformation.find_files(path=\"../data/images-little-text/\", limit=1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bf811ce0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# preview the first ten files that were found\n",
    "for i in images[0:10]:\n",
    "    display(Image(filename=i))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "366e2060",
   "metadata": {},
   "outputs": [],
   "source": [
    "# start with only English\n",
    "mysubfiles = [i for i in images if \"eng\" in i]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b330b267",
   "metadata": {},
   "outputs": [],
   "source": [
    "# preview the first ten English files\n",
    "for i in mysubfiles[0:10]:\n",
    "    display(Image(filename=i))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8b32409f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the dict from the English subset previewed above; the cells further\n",
    "# down look up an \"_eng\" key, which the unfiltered list may not contain.\n",
    "mydict = misinformation.utils.initialize_dict(mysubfiles[0:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3be954ef-d31f-4e4d-857c-c14d5fda91f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "mydict"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7b8b929f",
   "metadata": {},
   "source": [
    "# Google Cloud Vision API\n",
    "First 1000 images per month are free."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cbf74c0b-52fe-4fb8-b617-f18611e8f986",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fall back to the key file under ../data only when\n",
    "# GOOGLE_APPLICATION_CREDENTIALS is not already set in the environment.\n",
    "os.environ.setdefault(\n",
    "    \"GOOGLE_APPLICATION_CREDENTIALS\",\n",
    "    \"../data/seismic-bonfire-329406-412821a70264.json\",\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0891b795-c7fe-454c-a45d-45fadf788142",
   "metadata": {},
   "source": [
    "## Inspect the elements per image"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c6ecc88",
   "metadata": {},
   "outputs": [],
   "source": [
    "misinformation.explore_analysis(mydict, identify=\"text-on-image\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9c3e72b5-0e57-4019-b45e-3e36a74e7f52",
   "metadata": {},
   "source": [
    "## Or directly analyze for further processing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "365c78b1-7ff4-4213-86fa-6a0a2d05198f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# each entry is replaced by whatever analyse_image() returns for it\n",
    "for key in mydict:\n",
    "    print(key)\n",
    "    mydict[key] = misinformation.text.TextDetector(mydict[key]).analyse_image()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c978fdb4-1f3a-4b78-b6ff-79c6e8a6fe82",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(mydict[\"104157S_eng\"][\"text\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3724f8d8-be0f-44eb-8ff9-b425eee94efc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# take the first element of the extracted text as the sample to process\n",
    "test = mydict[\"104157S_eng\"][\"text\"][0]\n",
    "print(test)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f2a9c1e4",
   "metadata": {},
   "source": [
    "## Translate the extracted text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "051ad473-8f75-40fc-ae90-7d8176cf816f",
   "metadata": {},
   "outputs": [],
   "source": [
    "translator = Translator()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b571d900-8829-4095-904f-dfee3ce46041",
   "metadata": {},
   "outputs": [],
   "source": [
    "result = translator.translate(test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e1ff0b21-d2cc-4a50-8e86-a45362f1a0a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(result.text)\n",
    "print(result.src)\n",
    "print(result.origin)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d7b3e6a8",
   "metadata": {},
   "source": [
    "## Clean the text with a spell check"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e135f00e-cdd5-4931-8649-ba0b293e8bdd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# now clean the gibberish\n",
    "# do spell check\n",
    "nlp = spacy.load(\"en_core_web_md\")\n",
    "contextualSpellCheck.add_to_pipe(nlp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "01d17f6e-30ae-4cc1-ad03-11a3be6847c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "doc = nlp(result.text)\n",
    "print(doc._.outcome_spellCheck)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0ef91dd2-70a8-4a04-b50f-5efba076bbb0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep tokens that are neither whitespace nor numbers and that have a word\n",
    "# vector; tokens without a vector are dropped (presumably OCR gibberish).\n",
    "final_text = [\n",
    "    token.text\n",
    "    for token in doc\n",
    "    if token.pos_ not in (\"SPACE\", \"NUM\") and token.has_vector\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4d0a3360-3fd9-4ab3-b4dc-122e90c16c7c",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\" \".join(final_text))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  },
  "vscode": {
   "interpreter": {
    "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}