AMMICO/build/html/notebooks/Example text.html

713 строки
74 KiB
HTML

<!DOCTYPE html>
<html class="writer-html5" lang="en" >
<head>
<meta charset="utf-8" /><meta name="generator" content="Docutils 0.18.1: http://docutils.sourceforge.net/" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Notebook for text extraction on image &mdash; AMMICO 0.0.1 documentation</title>
<link rel="stylesheet" href="../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../_static/nbsphinx-code-cells.css" type="text/css" />
<!--[if lt IE 9]>
<script src="../_static/js/html5shiv.min.js"></script>
<![endif]-->
<script src="../_static/jquery.js?v=5d32c60e"></script>
<script src="../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="../_static/documentation_options.js?v=d45e8c67"></script>
<script src="../_static/doctools.js?v=888ff710"></script>
<script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
<script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script>
<script>window.MathJax = {"tex": {"inlineMath": [["$", "$"], ["\\(", "\\)"]], "processEscapes": true}, "options": {"ignoreHtmlClass": "tex2jax_ignore|mathjax_ignore|document", "processHtmlClass": "tex2jax_process|mathjax_process|math|output_area"}}</script>
<script defer="defer" src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
<script src="../_static/js/theme.js"></script>
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="Image summary and visual question answering" href="Example%20summary.html" />
<link rel="prev" title="Facial Expression recognition with DeepFace" href="Example%20faces.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="../index.html" class="icon icon-home">
AMMICO
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="../readme_link.html">AMMICO - AI Media and Misinformation Content Analysis Tool</a></li>
<li class="toctree-l1"><a class="reference internal" href="Example%20faces.html">Facial Expression recognition with DeepFace</a></li>
<li class="toctree-l1 current"><a class="current reference internal" href="#">Notebook for text extraction on image</a><ul>
<li class="toctree-l2"><a class="reference internal" href="#Google-cloud-vision-API">Google cloud vision API</a></li>
<li class="toctree-l2"><a class="reference internal" href="#Inspect-the-elements-per-image">Inspect the elements per image</a></li>
<li class="toctree-l2"><a class="reference internal" href="#Or-directly-analyze-for-further-processing">Or directly analyze for further processing</a></li>
<li class="toctree-l2"><a class="reference internal" href="#Convert-to-dataframe-and-write-csv">Convert to dataframe and write csv</a></li>
<li class="toctree-l2"><a class="reference internal" href="#Topic-analysis">Topic analysis</a><ul>
<li class="toctree-l3"><a class="reference internal" href="#Option-1:-Use-the-dictionary-as-obtained-from-the-above-analysis.">Option 1: Use the dictionary as obtained from the above analysis.</a></li>
<li class="toctree-l3"><a class="reference internal" href="#Option-2:-Read-in-a-csv">Option 2: Read in a csv</a></li>
<li class="toctree-l3"><a class="reference internal" href="#Access-frequent-topics">Access frequent topics</a></li>
<li class="toctree-l3"><a class="reference internal" href="#Get-information-for-specific-topic">Get information for specific topic</a></li>
<li class="toctree-l3"><a class="reference internal" href="#Topic-visualization">Topic visualization</a></li>
<li class="toctree-l3"><a class="reference internal" href="#Save-the-model">Save the model</a></li>
</ul>
</li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="Example%20summary.html">Image summary and visual question answering</a></li>
<li class="toctree-l1"><a class="reference internal" href="Example%20multimodal.html">Image Multimodal Search</a></li>
<li class="toctree-l1"><a class="reference internal" href="Example%20colors.html">Color analysis of pictures</a></li>
<li class="toctree-l1"><a class="reference internal" href="Example%20cropposts.html">Crop posts from social media posts images</a></li>
<li class="toctree-l1"><a class="reference internal" href="../modules.html">AMMICO package modules</a></li>
<li class="toctree-l1"><a class="reference internal" href="../license_link.html">License</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../index.html">AMMICO</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="../index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item active">Notebook for text extraction on image</li>
<li class="wy-breadcrumbs-aside">
<a href="../_sources/notebooks/Example%20text.ipynb.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<section id="Notebook-for-text-extraction-on-image">
<h1>Notebook for text extraction on image<a class="headerlink" href="#Notebook-for-text-extraction-on-image" title="Link to this heading"></a></h1>
<p>The text extraction and analysis is carried out using a variety of tools:</p>
<ol class="arabic simple">
<li><p>Text extraction from the image using <a class="reference external" href="https://cloud.google.com/vision">google-cloud-vision</a></p></li>
<li><p>Language detection of the extracted text using <a class="reference external" href="https://py-googletrans.readthedocs.io/en/latest/">Googletrans</a></p></li>
<li><p>Translation into English or other languages using <a class="reference external" href="https://py-googletrans.readthedocs.io/en/latest/">Googletrans</a></p></li>
<li><p>Cleaning of the text using <a class="reference external" href="https://spacy.io/">spacy</a></p></li>
<li><p>Spell-check using <a class="reference external" href="https://textblob.readthedocs.io/en/dev/index.html">TextBlob</a></p></li>
<li><p>Subjectivity analysis using <a class="reference external" href="https://textblob.readthedocs.io/en/dev/index.html">TextBlob</a></p></li>
<li><p>Text summarization using <a class="reference external" href="https://huggingface.co/docs/transformers/index">transformers</a> pipelines</p></li>
<li><p>Sentiment analysis using <a class="reference external" href="https://huggingface.co/docs/transformers/index">transformers</a> pipelines</p></li>
<li><p>Named entity recognition using <a class="reference external" href="https://huggingface.co/docs/transformers/index">transformers</a> pipelines</p></li>
<li><p>Topic analysis using <a class="reference external" href="https://github.com/MaartenGr/BERTopic">BERTopic</a></p></li>
</ol>
<p>The first cell is only run on Google Colab and installs the <a class="reference external" href="https://github.com/ssciwr/AMMICO">ammico</a> package.</p>
<p>After that, we can import <code class="docutils literal notranslate"><span class="pre">ammico</span></code> and read in the files given a folder path.</p>
<div class="nbinput nblast docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[1]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="c1"># if running on google colab</span>
<span class="c1"># flake8-noqa-cell</span>
<span class="kn">import</span> <span class="nn">os</span>
<span class="k">if</span> <span class="s2">&quot;google.colab&quot;</span> <span class="ow">in</span> <span class="nb">str</span><span class="p">(</span><span class="n">get_ipython</span><span class="p">()):</span>
<span class="c1"># update python version</span>
<span class="c1"># install setuptools</span>
<span class="c1"># %pip install setuptools==61 -qqq</span>
<span class="c1"># install ammico</span>
<span class="o">%</span><span class="k">pip</span> install git+https://github.com/ssciwr/ammico.git -qqq
<span class="c1"># mount google drive for data and API key</span>
<span class="kn">from</span> <span class="nn">google.colab</span> <span class="kn">import</span> <span class="n">drive</span>
<span class="n">drive</span><span class="o">.</span><span class="n">mount</span><span class="p">(</span><span class="s2">&quot;/content/drive&quot;</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="nbinput nblast docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[2]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">os</span>
<span class="kn">import</span> <span class="nn">ammico</span>
<span class="kn">from</span> <span class="nn">ammico</span> <span class="kn">import</span> <span class="n">utils</span> <span class="k">as</span> <span class="n">mutils</span>
<span class="kn">from</span> <span class="nn">ammico</span> <span class="kn">import</span> <span class="n">display</span> <span class="k">as</span> <span class="n">mdisplay</span>
</pre></div>
</div>
</div>
<p>We select a subset of image files to try the text extraction on (see the <code class="docutils literal notranslate"><span class="pre">limit</span></code> keyword). The <code class="docutils literal notranslate"><span class="pre">find_files</span></code> function finds image files within a given directory:</p>
<div class="nbinput nblast docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[3]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="c1"># Here you need to provide the path to your google drive folder</span>
<span class="c1"># or local folder containing the images</span>
<span class="n">images</span> <span class="o">=</span> <span class="n">mutils</span><span class="o">.</span><span class="n">find_files</span><span class="p">(</span>
<span class="n">path</span><span class="o">=</span><span class="s2">&quot;data/&quot;</span><span class="p">,</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">10</span><span class="p">,</span>
<span class="p">)</span>
</pre></div>
</div>
</div>
<p>We need to initialize the main dictionary that contains all information for the images and is updated through each subsequent analysis:</p>
<div class="nbinput nblast docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[4]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">mydict</span> <span class="o">=</span> <span class="n">mutils</span><span class="o">.</span><span class="n">initialize_dict</span><span class="p">(</span><span class="n">images</span><span class="p">)</span>
</pre></div>
</div>
</div>
<section id="Google-cloud-vision-API">
<h2>Google cloud vision API<a class="headerlink" href="#Google-cloud-vision-API" title="Link to this heading"></a></h2>
<p>For this you need an API key and must have the app activated in your Google Cloud console. The first 1000 images per month are free (as of July 2022).</p>
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span>os.environ[
&quot;GOOGLE_APPLICATION_CREDENTIALS&quot;
] = &quot;your-credentials.json&quot;
</pre></div>
</div>
</section>
<section id="Inspect-the-elements-per-image">
<h2>Inspect the elements per image<a class="headerlink" href="#Inspect-the-elements-per-image" title="Link to this heading"></a></h2>
<p>To check the analysis, you can inspect the analyzed elements here. Loading the results takes a moment, so please be patient. If you are sure of what you are doing, you can skip this and directly export a csv file in the step below. Here, we display the text extraction and translation results provided by the above libraries. Click on the tabs to see the results in the right sidebar. You may need to increment the <code class="docutils literal notranslate"><span class="pre">port</span></code> number if you are already running several notebook instances on the same
server.</p>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[5]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">analysis_explorer</span> <span class="o">=</span> <span class="n">mdisplay</span><span class="o">.</span><span class="n">AnalysisExplorer</span><span class="p">(</span><span class="n">mydict</span><span class="p">,</span> <span class="n">identify</span><span class="o">=</span><span class="s2">&quot;text-on-image&quot;</span><span class="p">)</span>
<span class="n">analysis_explorer</span><span class="o">.</span><span class="n">run_server</span><span class="p">(</span><span class="n">port</span><span class="o">=</span><span class="mi">8054</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
<span class="ansi-red-fg">---------------------------------------------------------------------------</span>
<span class="ansi-red-fg">TypeError</span> Traceback (most recent call last)
Cell <span class="ansi-green-fg">In[5], line 1</span>
<span class="ansi-green-fg">----&gt; 1</span> analysis_explorer <span style="color: rgb(98,98,98)">=</span> <span class="ansi-yellow-bg">mdisplay</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">AnalysisExplorer</span><span class="ansi-yellow-bg">(</span><span class="ansi-yellow-bg">mydict</span><span class="ansi-yellow-bg">,</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg">identify</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">&#34;</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">text-on-image</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">&#34;</span><span class="ansi-yellow-bg">)</span>
<span class="ansi-green-intense-fg ansi-bold"> 2</span> analysis_explorer<span style="color: rgb(98,98,98)">.</span>run_server(port<span style="color: rgb(98,98,98)">=</span><span style="color: rgb(98,98,98)">8054</span>)
<span class="ansi-red-fg">TypeError</span>: __init__() got an unexpected keyword argument &#39;identify&#39;
</pre></div></div>
</div>
</section>
<section id="Or-directly-analyze-for-further-processing">
<h2>Or directly analyze for further processing<a class="headerlink" href="#Or-directly-analyze-for-further-processing" title="Link to this heading"></a></h2>
<p>Instead of inspecting each of the images, you can also directly carry out the analysis and export the result into a csv. This may take a while depending on how many images you have loaded. Set the keyword <code class="docutils literal notranslate"><span class="pre">analyse_text</span></code> to <code class="docutils literal notranslate"><span class="pre">True</span></code> if you want the text to be analyzed (spell check, subjectivity, text summary, sentiment, NER).</p>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[6]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">mydict</span><span class="p">:</span>
<span class="n">mydict</span><span class="p">[</span><span class="n">key</span><span class="p">]</span> <span class="o">=</span> <span class="n">ammico</span><span class="o">.</span><span class="n">text</span><span class="o">.</span><span class="n">TextDetector</span><span class="p">(</span>
<span class="n">mydict</span><span class="p">[</span><span class="n">key</span><span class="p">],</span> <span class="n">analyse_text</span><span class="o">=</span><span class="kc">True</span>
<span class="p">)</span><span class="o">.</span><span class="n">analyse_image</span><span class="p">()</span>
</pre></div>
</div>
</div>
<div class="nboutput docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
Collecting en-core-web-md==3.7.0
Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.0/en_core_web_md-3.7.0-py3-none-any.whl (42.8 MB)
<span class="ansi-black-intense-fg">━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━</span> <span class="ansi-green-fg">42.8/42.8 MB</span> <span class="ansi-red-fg">57.8 MB/s</span> eta <span class="ansi-cyan-fg">0:00:00</span>
Requirement already satisfied: spacy&lt;3.8.0,&gt;=3.7.0 in /opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages (from en-core-web-md==3.7.0) (3.7.2)
Requirement already satisfied: spacy-legacy&lt;3.1.0,&gt;=3.0.11 in /opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages (from spacy&lt;3.8.0,&gt;=3.7.0-&gt;en-core-web-md==3.7.0) (3.0.12)
Requirement already satisfied: spacy-loggers&lt;2.0.0,&gt;=1.0.0 in /opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages (from spacy&lt;3.8.0,&gt;=3.7.0-&gt;en-core-web-md==3.7.0) (1.0.5)
Requirement already satisfied: murmurhash&lt;1.1.0,&gt;=0.28.0 in /opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages (from spacy&lt;3.8.0,&gt;=3.7.0-&gt;en-core-web-md==3.7.0) (1.0.10)
Requirement already satisfied: cymem&lt;2.1.0,&gt;=2.0.2 in /opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages (from spacy&lt;3.8.0,&gt;=3.7.0-&gt;en-core-web-md==3.7.0) (2.0.8)
Requirement already satisfied: preshed&lt;3.1.0,&gt;=3.0.2 in /opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages (from spacy&lt;3.8.0,&gt;=3.7.0-&gt;en-core-web-md==3.7.0) (3.0.9)
Requirement already satisfied: thinc&lt;8.3.0,&gt;=8.1.8 in /opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages (from spacy&lt;3.8.0,&gt;=3.7.0-&gt;en-core-web-md==3.7.0) (8.2.1)
Requirement already satisfied: wasabi&lt;1.2.0,&gt;=0.9.1 in /opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages (from spacy&lt;3.8.0,&gt;=3.7.0-&gt;en-core-web-md==3.7.0) (1.1.2)
Requirement already satisfied: srsly&lt;3.0.0,&gt;=2.4.3 in /opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages (from spacy&lt;3.8.0,&gt;=3.7.0-&gt;en-core-web-md==3.7.0) (2.4.8)
Requirement already satisfied: catalogue&lt;2.1.0,&gt;=2.0.6 in /opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages (from spacy&lt;3.8.0,&gt;=3.7.0-&gt;en-core-web-md==3.7.0) (2.0.10)
Requirement already satisfied: weasel&lt;0.4.0,&gt;=0.1.0 in /opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages (from spacy&lt;3.8.0,&gt;=3.7.0-&gt;en-core-web-md==3.7.0) (0.3.3)
Requirement already satisfied: typer&lt;0.10.0,&gt;=0.3.0 in /opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages (from spacy&lt;3.8.0,&gt;=3.7.0-&gt;en-core-web-md==3.7.0) (0.9.0)
Requirement already satisfied: smart-open&lt;7.0.0,&gt;=5.2.1 in /opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages (from spacy&lt;3.8.0,&gt;=3.7.0-&gt;en-core-web-md==3.7.0) (6.4.0)
Requirement already satisfied: tqdm&lt;5.0.0,&gt;=4.38.0 in /opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages (from spacy&lt;3.8.0,&gt;=3.7.0-&gt;en-core-web-md==3.7.0) (4.66.1)
Requirement already satisfied: requests&lt;3.0.0,&gt;=2.13.0 in /opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages (from spacy&lt;3.8.0,&gt;=3.7.0-&gt;en-core-web-md==3.7.0) (2.31.0)
Requirement already satisfied: pydantic!=1.8,!=1.8.1,&lt;3.0.0,&gt;=1.7.4 in /opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages (from spacy&lt;3.8.0,&gt;=3.7.0-&gt;en-core-web-md==3.7.0) (1.10.13)
Requirement already satisfied: jinja2 in /opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages (from spacy&lt;3.8.0,&gt;=3.7.0-&gt;en-core-web-md==3.7.0) (3.1.2)
Requirement already satisfied: setuptools in /opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages (from spacy&lt;3.8.0,&gt;=3.7.0-&gt;en-core-web-md==3.7.0) (58.1.0)
Requirement already satisfied: packaging&gt;=20.0 in /opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages (from spacy&lt;3.8.0,&gt;=3.7.0-&gt;en-core-web-md==3.7.0) (23.2)
Requirement already satisfied: langcodes&lt;4.0.0,&gt;=3.2.0 in /opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages (from spacy&lt;3.8.0,&gt;=3.7.0-&gt;en-core-web-md==3.7.0) (3.3.0)
Requirement already satisfied: numpy&gt;=1.19.0 in /opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages (from spacy&lt;3.8.0,&gt;=3.7.0-&gt;en-core-web-md==3.7.0) (1.23.4)
Requirement already satisfied: typing-extensions&gt;=4.2.0 in /opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages (from pydantic!=1.8,!=1.8.1,&lt;3.0.0,&gt;=1.7.4-&gt;spacy&lt;3.8.0,&gt;=3.7.0-&gt;en-core-web-md==3.7.0) (4.5.0)
Requirement already satisfied: charset-normalizer&lt;4,&gt;=2 in /opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages (from requests&lt;3.0.0,&gt;=2.13.0-&gt;spacy&lt;3.8.0,&gt;=3.7.0-&gt;en-core-web-md==3.7.0) (3.3.0)
Requirement already satisfied: idna&lt;4,&gt;=2.5 in /opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages (from requests&lt;3.0.0,&gt;=2.13.0-&gt;spacy&lt;3.8.0,&gt;=3.7.0-&gt;en-core-web-md==3.7.0) (2.10)
Requirement already satisfied: urllib3&lt;3,&gt;=1.21.1 in /opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages (from requests&lt;3.0.0,&gt;=2.13.0-&gt;spacy&lt;3.8.0,&gt;=3.7.0-&gt;en-core-web-md==3.7.0) (2.0.7)
Requirement already satisfied: certifi&gt;=2017.4.17 in /opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages (from requests&lt;3.0.0,&gt;=2.13.0-&gt;spacy&lt;3.8.0,&gt;=3.7.0-&gt;en-core-web-md==3.7.0) (2023.7.22)
Requirement already satisfied: blis&lt;0.8.0,&gt;=0.7.8 in /opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages (from thinc&lt;8.3.0,&gt;=8.1.8-&gt;spacy&lt;3.8.0,&gt;=3.7.0-&gt;en-core-web-md==3.7.0) (0.7.11)
Requirement already satisfied: confection&lt;1.0.0,&gt;=0.0.1 in /opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages (from thinc&lt;8.3.0,&gt;=8.1.8-&gt;spacy&lt;3.8.0,&gt;=3.7.0-&gt;en-core-web-md==3.7.0) (0.1.3)
Requirement already satisfied: click&lt;9.0.0,&gt;=7.1.1 in /opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages (from typer&lt;0.10.0,&gt;=0.3.0-&gt;spacy&lt;3.8.0,&gt;=3.7.0-&gt;en-core-web-md==3.7.0) (8.1.7)
Requirement already satisfied: cloudpathlib&lt;0.17.0,&gt;=0.7.0 in /opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages (from weasel&lt;0.4.0,&gt;=0.1.0-&gt;spacy&lt;3.8.0,&gt;=3.7.0-&gt;en-core-web-md==3.7.0) (0.16.0)
Requirement already satisfied: MarkupSafe&gt;=2.0 in /opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages (from jinja2-&gt;spacy&lt;3.8.0,&gt;=3.7.0-&gt;en-core-web-md==3.7.0) (2.1.3)
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.7.0
</pre></div></div>
</div>
<div class="nboutput docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area stderr docutils container">
<div class="highlight"><pre>
<span class="ansi-bold">[</span><span class="ansi-blue-fg">notice</span><span class="ansi-bold">]</span> A new release of pip is available: <span class="ansi-red-fg">23.0.1</span> -&gt; <span class="ansi-green-fg">23.3</span>
<span class="ansi-bold">[</span><span class="ansi-blue-fg">notice</span><span class="ansi-bold">]</span> To update, run: <span class="ansi-green-fg">pip install --upgrade pip</span>
</pre></div></div>
</div>
<div class="nboutput docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
<span class="ansi-green-fg">✔ Download and installation successful</span>
You can now load the package via spacy.load(&#39;en_core_web_md&#39;)
</pre></div></div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
<span class="ansi-red-fg">---------------------------------------------------------------------------</span>
<span class="ansi-red-fg">FileNotFoundError</span> Traceback (most recent call last)
Cell <span class="ansi-green-fg">In[6], line 2</span>
<span class="ansi-green-intense-fg ansi-bold"> 1</span> <span class="ansi-bold" style="color: rgb(0,135,0)">for</span> key <span class="ansi-bold" style="color: rgb(175,0,255)">in</span> mydict:
<span class="ansi-green-fg">----&gt; 2</span> mydict[key] <span style="color: rgb(98,98,98)">=</span> <span class="ansi-yellow-bg">ammico</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">text</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">TextDetector</span><span class="ansi-yellow-bg">(</span>
<span class="ansi-green-intense-fg ansi-bold"> 3</span> <span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg">mydict</span><span class="ansi-yellow-bg">[</span><span class="ansi-yellow-bg">key</span><span class="ansi-yellow-bg">]</span><span class="ansi-yellow-bg">,</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg">analyse_text</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg ansi-bold" style="color: rgb(0,135,0)">True</span>
<span class="ansi-green-intense-fg ansi-bold"> 4</span> <span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg">)</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">analyse_image</span><span class="ansi-yellow-bg">(</span><span class="ansi-yellow-bg">)</span>
File <span class="ansi-green-fg">~/work/AMMICO/AMMICO/ammico/text.py:158</span>, in <span class="ansi-cyan-fg">TextDetector.analyse_image</span><span class="ansi-blue-fg">(self)</span>
<span class="ansi-green-intense-fg ansi-bold"> 152</span> <span class="ansi-bold" style="color: rgb(0,135,0)">def</span> <span style="color: rgb(0,0,255)">analyse_image</span>(<span style="color: rgb(0,135,0)">self</span>) <span style="color: rgb(98,98,98)">-</span><span style="color: rgb(98,98,98)">&gt;</span> <span style="color: rgb(0,135,0)">dict</span>:
<span class="ansi-green-intense-fg ansi-bold"> 153</span> <span style="color: rgb(188,188,188)"> </span><span style="color: rgb(175,0,0)">&#34;&#34;&#34;Perform text extraction and analysis of the text.</span>
<span class="ansi-green-intense-fg ansi-bold"> 154</span>
<span class="ansi-green-intense-fg ansi-bold"> 155</span> <span style="color: rgb(175,0,0)"> Returns:</span>
<span class="ansi-green-intense-fg ansi-bold"> 156</span> <span style="color: rgb(175,0,0)"> dict: The updated dictionary with text analysis results.</span>
<span class="ansi-green-intense-fg ansi-bold"> 157</span> <span style="color: rgb(175,0,0)"> &#34;&#34;&#34;</span>
<span class="ansi-green-fg">--&gt; 158</span> <span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">get_text_from_image</span><span class="ansi-yellow-bg">(</span><span class="ansi-yellow-bg">)</span>
<span class="ansi-green-intense-fg ansi-bold"> 159</span> <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>translate_text()
<span class="ansi-green-intense-fg ansi-bold"> 160</span> <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>remove_linebreaks()
File <span class="ansi-green-fg">~/work/AMMICO/AMMICO/ammico/text.py:178</span>, in <span class="ansi-cyan-fg">TextDetector.get_text_from_image</span><span class="ansi-blue-fg">(self)</span>
<span class="ansi-green-intense-fg ansi-bold"> 174</span> <span class="ansi-bold" style="color: rgb(0,135,0)">except</span> DefaultCredentialsError:
<span class="ansi-green-intense-fg ansi-bold"> 175</span> <span class="ansi-bold" style="color: rgb(0,135,0)">raise</span> DefaultCredentialsError(
<span class="ansi-green-intense-fg ansi-bold"> 176</span> <span style="color: rgb(175,0,0)">&#34;</span><span style="color: rgb(175,0,0)">Please provide credentials for google cloud vision API, see https://cloud.google.com/docs/authentication/application-default-credentials.</span><span style="color: rgb(175,0,0)">&#34;</span>
<span class="ansi-green-intense-fg ansi-bold"> 177</span> )
<span class="ansi-green-fg">--&gt; 178</span> <span class="ansi-bold" style="color: rgb(0,135,0)">with</span> <span class="ansi-yellow-bg">io</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">open</span><span class="ansi-yellow-bg">(</span><span class="ansi-yellow-bg">path</span><span class="ansi-yellow-bg">,</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">&#34;</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">rb</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">&#34;</span><span class="ansi-yellow-bg">)</span> <span class="ansi-bold" style="color: rgb(0,135,0)">as</span> image_file:
<span class="ansi-green-intense-fg ansi-bold"> 179</span> content <span style="color: rgb(98,98,98)">=</span> image_file<span style="color: rgb(98,98,98)">.</span>read()
<span class="ansi-green-intense-fg ansi-bold"> 180</span> image <span style="color: rgb(98,98,98)">=</span> vision<span style="color: rgb(98,98,98)">.</span>Image(content<span style="color: rgb(98,98,98)">=</span>content)
<span class="ansi-red-fg">FileNotFoundError</span>: [Errno 2] No such file or directory: &#39;102141_2_eng&#39;
</pre></div></div>
</div>
</section>
<section id="Convert-to-dataframe-and-write-csv">
<h2>Convert to dataframe and write csv<a class="headerlink" href="#Convert-to-dataframe-and-write-csv" title="Link to this heading"></a></h2>
<p>These steps are required to convert the dictionary of dictionaries into a dictionary of lists, which can be converted into a pandas dataframe and exported to a csv file.</p>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[7]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">outdict</span> <span class="o">=</span> <span class="n">mutils</span><span class="o">.</span><span class="n">append_data_to_dict</span><span class="p">(</span><span class="n">mydict</span><span class="p">)</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">mutils</span><span class="o">.</span><span class="n">dump_df</span><span class="p">(</span><span class="n">outdict</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
<span class="ansi-red-fg">---------------------------------------------------------------------------</span>
<span class="ansi-red-fg">ValueError</span> Traceback (most recent call last)
Cell <span class="ansi-green-fg">In[7], line 2</span>
<span class="ansi-green-intense-fg ansi-bold"> 1</span> outdict <span style="color: rgb(98,98,98)">=</span> mutils<span style="color: rgb(98,98,98)">.</span>append_data_to_dict(mydict)
<span class="ansi-green-fg">----&gt; 2</span> df <span style="color: rgb(98,98,98)">=</span> <span class="ansi-yellow-bg">mutils</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">dump_df</span><span class="ansi-yellow-bg">(</span><span class="ansi-yellow-bg">outdict</span><span class="ansi-yellow-bg">)</span>
File <span class="ansi-green-fg">~/work/AMMICO/AMMICO/ammico/utils.py:222</span>, in <span class="ansi-cyan-fg">dump_df</span><span class="ansi-blue-fg">(mydict)</span>
<span class="ansi-green-intense-fg ansi-bold"> 220</span> <span class="ansi-bold" style="color: rgb(0,135,0)">def</span> <span style="color: rgb(0,0,255)">dump_df</span>(mydict: <span style="color: rgb(0,135,0)">dict</span>) <span style="color: rgb(98,98,98)">-</span><span style="color: rgb(98,98,98)">&gt;</span> DataFrame:
<span class="ansi-green-intense-fg ansi-bold"> 221</span> <span style="color: rgb(188,188,188)"> </span><span style="color: rgb(175,0,0)">&#34;&#34;&#34;Utility to dump the dictionary into a dataframe.&#34;&#34;&#34;</span>
<span class="ansi-green-fg">--&gt; 222</span> <span class="ansi-bold" style="color: rgb(0,135,0)">return</span> <span class="ansi-yellow-bg">DataFrame</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">from_dict</span><span class="ansi-yellow-bg">(</span><span class="ansi-yellow-bg">mydict</span><span class="ansi-yellow-bg">)</span>
File <span class="ansi-green-fg">/opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages/pandas/core/frame.py:1816</span>, in <span class="ansi-cyan-fg">DataFrame.from_dict</span><span class="ansi-blue-fg">(cls, data, orient, dtype, columns)</span>
<span class="ansi-green-intense-fg ansi-bold"> 1810</span> <span class="ansi-bold" style="color: rgb(0,135,0)">raise</span> <span class="ansi-bold" style="color: rgb(215,95,95)">ValueError</span>(
<span class="ansi-green-intense-fg ansi-bold"> 1811</span> <span style="color: rgb(175,0,0)">f</span><span style="color: rgb(175,0,0)">&#34;</span><span style="color: rgb(175,0,0)">Expected </span><span style="color: rgb(175,0,0)">&#39;</span><span style="color: rgb(175,0,0)">index</span><span style="color: rgb(175,0,0)">&#39;</span><span style="color: rgb(175,0,0)">, </span><span style="color: rgb(175,0,0)">&#39;</span><span style="color: rgb(175,0,0)">columns</span><span style="color: rgb(175,0,0)">&#39;</span><span style="color: rgb(175,0,0)"> or </span><span style="color: rgb(175,0,0)">&#39;</span><span style="color: rgb(175,0,0)">tight</span><span style="color: rgb(175,0,0)">&#39;</span><span style="color: rgb(175,0,0)"> for orient parameter. </span><span style="color: rgb(175,0,0)">&#34;</span>
<span class="ansi-green-intense-fg ansi-bold"> 1812</span> <span style="color: rgb(175,0,0)">f</span><span style="color: rgb(175,0,0)">&#34;</span><span style="color: rgb(175,0,0)">Got </span><span style="color: rgb(175,0,0)">&#39;</span><span class="ansi-bold" style="color: rgb(175,95,135)">{</span>orient<span class="ansi-bold" style="color: rgb(175,95,135)">}</span><span style="color: rgb(175,0,0)">&#39;</span><span style="color: rgb(175,0,0)"> instead</span><span style="color: rgb(175,0,0)">&#34;</span>
<span class="ansi-green-intense-fg ansi-bold"> 1813</span> )
<span class="ansi-green-intense-fg ansi-bold"> 1815</span> <span class="ansi-bold" style="color: rgb(0,135,0)">if</span> orient <span style="color: rgb(98,98,98)">!=</span> <span style="color: rgb(175,0,0)">&#34;</span><span style="color: rgb(175,0,0)">tight</span><span style="color: rgb(175,0,0)">&#34;</span>:
<span class="ansi-green-fg">-&gt; 1816</span> <span class="ansi-bold" style="color: rgb(0,135,0)">return</span> <span class="ansi-yellow-bg" style="color: rgb(0,135,0)">cls</span><span class="ansi-yellow-bg">(</span><span class="ansi-yellow-bg">data</span><span class="ansi-yellow-bg">,</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg">index</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg">index</span><span class="ansi-yellow-bg">,</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg">columns</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg">columns</span><span class="ansi-yellow-bg">,</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg">dtype</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg">dtype</span><span class="ansi-yellow-bg">)</span>
<span class="ansi-green-intense-fg ansi-bold"> 1817</span> <span class="ansi-bold" style="color: rgb(0,135,0)">else</span>:
<span class="ansi-green-intense-fg ansi-bold"> 1818</span> realdata <span style="color: rgb(98,98,98)">=</span> data[<span style="color: rgb(175,0,0)">&#34;</span><span style="color: rgb(175,0,0)">data</span><span style="color: rgb(175,0,0)">&#34;</span>]
File <span class="ansi-green-fg">/opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages/pandas/core/frame.py:736</span>, in <span class="ansi-cyan-fg">DataFrame.__init__</span><span class="ansi-blue-fg">(self, data, index, columns, dtype, copy)</span>
<span class="ansi-green-intense-fg ansi-bold"> 730</span> mgr <span style="color: rgb(98,98,98)">=</span> <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>_init_mgr(
<span class="ansi-green-intense-fg ansi-bold"> 731</span> data, axes<span style="color: rgb(98,98,98)">=</span>{<span style="color: rgb(175,0,0)">&#34;</span><span style="color: rgb(175,0,0)">index</span><span style="color: rgb(175,0,0)">&#34;</span>: index, <span style="color: rgb(175,0,0)">&#34;</span><span style="color: rgb(175,0,0)">columns</span><span style="color: rgb(175,0,0)">&#34;</span>: columns}, dtype<span style="color: rgb(98,98,98)">=</span>dtype, copy<span style="color: rgb(98,98,98)">=</span>copy
<span class="ansi-green-intense-fg ansi-bold"> 732</span> )
<span class="ansi-green-intense-fg ansi-bold"> 734</span> <span class="ansi-bold" style="color: rgb(0,135,0)">elif</span> <span style="color: rgb(0,135,0)">isinstance</span>(data, <span style="color: rgb(0,135,0)">dict</span>):
<span class="ansi-green-intense-fg ansi-bold"> 735</span> <span style="color: rgb(95,135,135)"># GH#38939 de facto copy defaults to False only in non-dict cases</span>
<span class="ansi-green-fg">--&gt; 736</span> mgr <span style="color: rgb(98,98,98)">=</span> <span class="ansi-yellow-bg">dict_to_mgr</span><span class="ansi-yellow-bg">(</span><span class="ansi-yellow-bg">data</span><span class="ansi-yellow-bg">,</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg">index</span><span class="ansi-yellow-bg">,</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg">columns</span><span class="ansi-yellow-bg">,</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg">dtype</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg">dtype</span><span class="ansi-yellow-bg">,</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg">copy</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg">copy</span><span class="ansi-yellow-bg">,</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg">typ</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg">manager</span><span class="ansi-yellow-bg">)</span>
<span class="ansi-green-intense-fg ansi-bold"> 737</span> <span class="ansi-bold" style="color: rgb(0,135,0)">elif</span> <span style="color: rgb(0,135,0)">isinstance</span>(data, ma<span style="color: rgb(98,98,98)">.</span>MaskedArray):
<span class="ansi-green-intense-fg ansi-bold"> 738</span> <span class="ansi-bold" style="color: rgb(0,135,0)">from</span> <span class="ansi-bold" style="color: rgb(0,0,255)">numpy</span><span class="ansi-bold" style="color: rgb(0,0,255)">.</span><span class="ansi-bold" style="color: rgb(0,0,255)">ma</span> <span class="ansi-bold" style="color: rgb(0,135,0)">import</span> mrecords
File <span class="ansi-green-fg">/opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages/pandas/core/internals/construction.py:503</span>, in <span class="ansi-cyan-fg">dict_to_mgr</span><span class="ansi-blue-fg">(data, index, columns, dtype, typ, copy)</span>
<span class="ansi-green-intense-fg ansi-bold"> 499</span> <span class="ansi-bold" style="color: rgb(0,135,0)">else</span>:
<span class="ansi-green-intense-fg ansi-bold"> 500</span> <span style="color: rgb(95,135,135)"># dtype check to exclude e.g. range objects, scalars</span>
<span class="ansi-green-intense-fg ansi-bold"> 501</span> arrays <span style="color: rgb(98,98,98)">=</span> [x<span style="color: rgb(98,98,98)">.</span>copy() <span class="ansi-bold" style="color: rgb(0,135,0)">if</span> <span style="color: rgb(0,135,0)">hasattr</span>(x, <span style="color: rgb(175,0,0)">&#34;</span><span style="color: rgb(175,0,0)">dtype</span><span style="color: rgb(175,0,0)">&#34;</span>) <span class="ansi-bold" style="color: rgb(0,135,0)">else</span> x <span class="ansi-bold" style="color: rgb(0,135,0)">for</span> x <span class="ansi-bold" style="color: rgb(175,0,255)">in</span> arrays]
<span class="ansi-green-fg">--&gt; 503</span> <span class="ansi-bold" style="color: rgb(0,135,0)">return</span> <span class="ansi-yellow-bg">arrays_to_mgr</span><span class="ansi-yellow-bg">(</span><span class="ansi-yellow-bg">arrays</span><span class="ansi-yellow-bg">,</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg">columns</span><span class="ansi-yellow-bg">,</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg">index</span><span class="ansi-yellow-bg">,</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg">dtype</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg">dtype</span><span class="ansi-yellow-bg">,</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg">typ</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg">typ</span><span class="ansi-yellow-bg">,</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg">consolidate</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg">copy</span><span class="ansi-yellow-bg">)</span>
File <span class="ansi-green-fg">/opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages/pandas/core/internals/construction.py:114</span>, in <span class="ansi-cyan-fg">arrays_to_mgr</span><span class="ansi-blue-fg">(arrays, columns, index, dtype, verify_integrity, typ, consolidate)</span>
<span class="ansi-green-intense-fg ansi-bold"> 111</span> <span class="ansi-bold" style="color: rgb(0,135,0)">if</span> verify_integrity:
<span class="ansi-green-intense-fg ansi-bold"> 112</span> <span style="color: rgb(95,135,135)"># figure out the index, if necessary</span>
<span class="ansi-green-intense-fg ansi-bold"> 113</span> <span class="ansi-bold" style="color: rgb(0,135,0)">if</span> index <span class="ansi-bold" style="color: rgb(175,0,255)">is</span> <span class="ansi-bold" style="color: rgb(0,135,0)">None</span>:
<span class="ansi-green-fg">--&gt; 114</span> index <span style="color: rgb(98,98,98)">=</span> <span class="ansi-yellow-bg">_extract_index</span><span class="ansi-yellow-bg">(</span><span class="ansi-yellow-bg">arrays</span><span class="ansi-yellow-bg">)</span>
<span class="ansi-green-intense-fg ansi-bold"> 115</span> <span class="ansi-bold" style="color: rgb(0,135,0)">else</span>:
<span class="ansi-green-intense-fg ansi-bold"> 116</span> index <span style="color: rgb(98,98,98)">=</span> ensure_index(index)
File <span class="ansi-green-fg">/opt/hostedtoolcache/Python/3.9.18/x64/lib/python3.9/site-packages/pandas/core/internals/construction.py:677</span>, in <span class="ansi-cyan-fg">_extract_index</span><span class="ansi-blue-fg">(data)</span>
<span class="ansi-green-intense-fg ansi-bold"> 675</span> lengths <span style="color: rgb(98,98,98)">=</span> <span style="color: rgb(0,135,0)">list</span>(<span style="color: rgb(0,135,0)">set</span>(raw_lengths))
<span class="ansi-green-intense-fg ansi-bold"> 676</span> <span class="ansi-bold" style="color: rgb(0,135,0)">if</span> <span style="color: rgb(0,135,0)">len</span>(lengths) <span style="color: rgb(98,98,98)">&gt;</span> <span style="color: rgb(98,98,98)">1</span>:
<span class="ansi-green-fg">--&gt; 677</span> <span class="ansi-bold" style="color: rgb(0,135,0)">raise</span> <span class="ansi-bold" style="color: rgb(215,95,95)">ValueError</span>(<span style="color: rgb(175,0,0)">&#34;</span><span style="color: rgb(175,0,0)">All arrays must be of the same length</span><span style="color: rgb(175,0,0)">&#34;</span>)
<span class="ansi-green-intense-fg ansi-bold"> 679</span> <span class="ansi-bold" style="color: rgb(0,135,0)">if</span> have_dicts:
<span class="ansi-green-intense-fg ansi-bold"> 680</span> <span class="ansi-bold" style="color: rgb(0,135,0)">raise</span> <span class="ansi-bold" style="color: rgb(215,95,95)">ValueError</span>(
<span class="ansi-green-intense-fg ansi-bold"> 681</span> <span style="color: rgb(175,0,0)">&#34;</span><span style="color: rgb(175,0,0)">Mixing dicts with non-Series may lead to ambiguous ordering.</span><span style="color: rgb(175,0,0)">&#34;</span>
<span class="ansi-green-intense-fg ansi-bold"> 682</span> )
<span class="ansi-red-fg">ValueError</span>: All arrays must be of the same length
</pre></div></div>
</div>
<p>Check the dataframe:</p>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[8]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">10</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
<span class="ansi-red-fg">---------------------------------------------------------------------------</span>
<span class="ansi-red-fg">NameError</span> Traceback (most recent call last)
Cell <span class="ansi-green-fg">In[8], line 1</span>
<span class="ansi-green-fg">----&gt; 1</span> <span class="ansi-yellow-bg">df</span><span style="color: rgb(98,98,98)">.</span>head(<span style="color: rgb(98,98,98)">10</span>)
<span class="ansi-red-fg">NameError</span>: name &#39;df&#39; is not defined
</pre></div></div>
</div>
<p>Write the csv file - here you should provide a file path and file name for the csv file to be written.</p>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[9]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="c1"># Write the csv</span>
<span class="n">df</span><span class="o">.</span><span class="n">to_csv</span><span class="p">(</span><span class="s2">&quot;./data_out.csv&quot;</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
<span class="ansi-red-fg">---------------------------------------------------------------------------</span>
<span class="ansi-red-fg">NameError</span> Traceback (most recent call last)
Cell <span class="ansi-green-fg">In[9], line 2</span>
<span class="ansi-green-intense-fg ansi-bold"> 1</span> <span style="color: rgb(95,135,135)"># Write the csv</span>
<span class="ansi-green-fg">----&gt; 2</span> <span class="ansi-yellow-bg">df</span><span style="color: rgb(98,98,98)">.</span>to_csv(<span style="color: rgb(175,0,0)">&#34;</span><span style="color: rgb(175,0,0)">./data_out.csv</span><span style="color: rgb(175,0,0)">&#34;</span>)
<span class="ansi-red-fg">NameError</span>: name &#39;df&#39; is not defined
</pre></div></div>
</div>
</section>
<section id="Topic-analysis">
<h2>Topic analysis<a class="headerlink" href="#Topic-analysis" title="Link to this heading"></a></h2>
<p>The topic analysis is carried out with <a class="reference external" href="https://maartengr.github.io/BERTopic/index.html">BERTopic</a>, using an embedded model through a <a class="reference external" href="https://spacy.io/">spaCy</a> pipeline.</p>
<p>BERTopic takes a list of strings as input. The more items in the list, the better for the topic modeling. If the cell below returns an error for <code class="docutils literal notranslate"><span class="pre">analyse_topic()</span></code>, the reason may be that your dataset is too small.</p>
<p>You can specify which dataframe entry you would like to have analyzed. The default is <code class="docutils literal notranslate"><span class="pre">text_english</span></code>, but you could, for example, also select <code class="docutils literal notranslate"><span class="pre">text_summary</span></code> or <code class="docutils literal notranslate"><span class="pre">text_english_correct</span></code> by setting the keyword <code class="docutils literal notranslate"><span class="pre">analyze_text</span></code> like so:</p>
<p><code class="docutils literal notranslate"><span class="pre">ammico.text.PostprocessText(mydict=mydict,</span> <span class="pre">analyze_text=&quot;text_summary&quot;).analyse_topic()</span></code></p>
<section id="Option-1:-Use-the-dictionary-as-obtained-from-the-above-analysis.">
<h3>Option 1: Use the dictionary as obtained from the above analysis.<a class="headerlink" href="#Option-1:-Use-the-dictionary-as-obtained-from-the-above-analysis." title="Link to this heading"></a></h3>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[10]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="c1"># make a list of all the text_english entries per analysed image from the mydict variable as above</span>
<span class="n">topic_model</span><span class="p">,</span> <span class="n">topic_df</span><span class="p">,</span> <span class="n">most_frequent_topics</span> <span class="o">=</span> <span class="n">ammico</span><span class="o">.</span><span class="n">text</span><span class="o">.</span><span class="n">PostprocessText</span><span class="p">(</span>
<span class="n">mydict</span><span class="o">=</span><span class="n">mydict</span>
<span class="p">)</span><span class="o">.</span><span class="n">analyse_topic</span><span class="p">()</span>
</pre></div>
</div>
</div>
<div class="nboutput docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
Reading data from dict.
</pre></div></div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
<span class="ansi-red-fg">---------------------------------------------------------------------------</span>
<span class="ansi-red-fg">ValueError</span> Traceback (most recent call last)
Cell <span class="ansi-green-fg">In[10], line 2</span>
<span class="ansi-green-intense-fg ansi-bold"> 1</span> <span style="color: rgb(95,135,135)"># make a list of all the text_english entries per analysed image from the mydict variable as above</span>
<span class="ansi-green-fg">----&gt; 2</span> topic_model, topic_df, most_frequent_topics <span style="color: rgb(98,98,98)">=</span> <span class="ansi-yellow-bg">ammico</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">text</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">PostprocessText</span><span class="ansi-yellow-bg">(</span>
<span class="ansi-green-intense-fg ansi-bold"> 3</span> <span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg">mydict</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg">mydict</span>
<span class="ansi-green-intense-fg ansi-bold"> 4</span> <span class="ansi-yellow-bg">)</span><span style="color: rgb(98,98,98)">.</span>analyse_topic()
File <span class="ansi-green-fg">~/work/AMMICO/AMMICO/ammico/text.py:303</span>, in <span class="ansi-cyan-fg">PostprocessText.__init__</span><span class="ansi-blue-fg">(self, mydict, use_csv, csv_path, analyze_text)</span>
<span class="ansi-green-intense-fg ansi-bold"> 301</span> <span style="color: rgb(0,135,0)">print</span>(<span style="color: rgb(175,0,0)">&#34;</span><span style="color: rgb(175,0,0)">Reading data from dict.</span><span style="color: rgb(175,0,0)">&#34;</span>)
<span class="ansi-green-intense-fg ansi-bold"> 302</span> <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>mydict <span style="color: rgb(98,98,98)">=</span> mydict
<span class="ansi-green-fg">--&gt; 303</span> <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>list_text_english <span style="color: rgb(98,98,98)">=</span> <span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">get_text_dict</span><span class="ansi-yellow-bg">(</span><span class="ansi-yellow-bg">analyze_text</span><span class="ansi-yellow-bg">)</span>
<span class="ansi-green-intense-fg ansi-bold"> 304</span> <span class="ansi-bold" style="color: rgb(0,135,0)">elif</span> <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>use_csv:
<span class="ansi-green-intense-fg ansi-bold"> 305</span> <span style="color: rgb(0,135,0)">print</span>(<span style="color: rgb(175,0,0)">&#34;</span><span style="color: rgb(175,0,0)">Reading data from df.</span><span style="color: rgb(175,0,0)">&#34;</span>)
File <span class="ansi-green-fg">~/work/AMMICO/AMMICO/ammico/text.py:375</span>, in <span class="ansi-cyan-fg">PostprocessText.get_text_dict</span><span class="ansi-blue-fg">(self, analyze_text)</span>
<span class="ansi-green-intense-fg ansi-bold"> 373</span> <span class="ansi-bold" style="color: rgb(0,135,0)">for</span> key <span class="ansi-bold" style="color: rgb(175,0,255)">in</span> <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>mydict<span style="color: rgb(98,98,98)">.</span>keys():
<span class="ansi-green-intense-fg ansi-bold"> 374</span> <span class="ansi-bold" style="color: rgb(0,135,0)">if</span> analyze_text <span class="ansi-bold" style="color: rgb(175,0,255)">not</span> <span class="ansi-bold" style="color: rgb(175,0,255)">in</span> <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>mydict[key]:
<span class="ansi-green-fg">--&gt; 375</span> <span class="ansi-bold" style="color: rgb(0,135,0)">raise</span> <span class="ansi-bold" style="color: rgb(215,95,95)">ValueError</span>(
<span class="ansi-green-intense-fg ansi-bold"> 376</span> <span style="color: rgb(175,0,0)">&#34;</span><span style="color: rgb(175,0,0)">Please check your provided dictionary - </span><span class="ansi-bold" style="color: rgb(175,95,0)">\</span>
<span class="ansi-green-intense-fg ansi-bold"> 377</span> <span style="color: rgb(175,0,0)"> no </span><span class="ansi-bold" style="color: rgb(175,95,135)">{}</span><span style="color: rgb(175,0,0)"> text data found.</span><span style="color: rgb(175,0,0)">&#34;</span><span style="color: rgb(98,98,98)">.</span>format(
<span class="ansi-green-intense-fg ansi-bold"> 378</span> analyze_text
<span class="ansi-green-intense-fg ansi-bold"> 379</span> )
<span class="ansi-green-intense-fg ansi-bold"> 380</span> )
<span class="ansi-green-intense-fg ansi-bold"> 381</span> list_text_english<span style="color: rgb(98,98,98)">.</span>append(<span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>mydict[key][analyze_text])
<span class="ansi-green-intense-fg ansi-bold"> 382</span> <span class="ansi-bold" style="color: rgb(0,135,0)">return</span> list_text_english
<span class="ansi-red-fg">ValueError</span>: Please check your provided dictionary - no text_english text data found.
</pre></div></div>
</div>
</section>
<section id="Option-2:-Read-in-a-csv">
<h3>Option 2: Read in a csv<a class="headerlink" href="#Option-2:-Read-in-a-csv" title="Link to this heading"></a></h3>
<p>To avoid analysing too many images on Google Cloud Vision, use the csv output to obtain the text (when rerunning already analysed images).</p>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[11]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">input_file_path</span> <span class="o">=</span> <span class="s2">&quot;data_out.csv&quot;</span>
<span class="n">topic_model</span><span class="p">,</span> <span class="n">topic_df</span><span class="p">,</span> <span class="n">most_frequent_topics</span> <span class="o">=</span> <span class="n">ammico</span><span class="o">.</span><span class="n">text</span><span class="o">.</span><span class="n">PostprocessText</span><span class="p">(</span>
<span class="n">use_csv</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">csv_path</span><span class="o">=</span><span class="n">input_file_path</span>
<span class="p">)</span><span class="o">.</span><span class="n">analyse_topic</span><span class="p">(</span><span class="n">return_topics</span><span class="o">=</span><span class="mi">10</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="nboutput docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
Reading data from df.
</pre></div></div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
<span class="ansi-red-fg">---------------------------------------------------------------------------</span>
<span class="ansi-red-fg">ValueError</span> Traceback (most recent call last)
Cell <span class="ansi-green-fg">In[11], line 2</span>
<span class="ansi-green-intense-fg ansi-bold"> 1</span> input_file_path <span style="color: rgb(98,98,98)">=</span> <span style="color: rgb(175,0,0)">&#34;</span><span style="color: rgb(175,0,0)">data_out.csv</span><span style="color: rgb(175,0,0)">&#34;</span>
<span class="ansi-green-fg">----&gt; 2</span> topic_model, topic_df, most_frequent_topics <span style="color: rgb(98,98,98)">=</span> <span class="ansi-yellow-bg">ammico</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">text</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">PostprocessText</span><span class="ansi-yellow-bg">(</span>
<span class="ansi-green-intense-fg ansi-bold"> 3</span> <span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg">use_csv</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg ansi-bold" style="color: rgb(0,135,0)">True</span><span class="ansi-yellow-bg">,</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg">csv_path</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg">input_file_path</span>
<span class="ansi-green-intense-fg ansi-bold"> 4</span> <span class="ansi-yellow-bg">)</span><span style="color: rgb(98,98,98)">.</span>analyse_topic(return_topics<span style="color: rgb(98,98,98)">=</span><span style="color: rgb(98,98,98)">10</span>)
File <span class="ansi-green-fg">~/work/AMMICO/AMMICO/ammico/text.py:307</span>, in <span class="ansi-cyan-fg">PostprocessText.__init__</span><span class="ansi-blue-fg">(self, mydict, use_csv, csv_path, analyze_text)</span>
<span class="ansi-green-intense-fg ansi-bold"> 305</span> <span style="color: rgb(0,135,0)">print</span>(<span style="color: rgb(175,0,0)">&#34;</span><span style="color: rgb(175,0,0)">Reading data from df.</span><span style="color: rgb(175,0,0)">&#34;</span>)
<span class="ansi-green-intense-fg ansi-bold"> 306</span> <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>df <span style="color: rgb(98,98,98)">=</span> pd<span style="color: rgb(98,98,98)">.</span>read_csv(csv_path, encoding<span style="color: rgb(98,98,98)">=</span><span style="color: rgb(175,0,0)">&#34;</span><span style="color: rgb(175,0,0)">utf8</span><span style="color: rgb(175,0,0)">&#34;</span>)
<span class="ansi-green-fg">--&gt; 307</span> <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>list_text_english <span style="color: rgb(98,98,98)">=</span> <span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">get_text_df</span><span class="ansi-yellow-bg">(</span><span class="ansi-yellow-bg">analyze_text</span><span class="ansi-yellow-bg">)</span>
<span class="ansi-green-intense-fg ansi-bold"> 308</span> <span class="ansi-bold" style="color: rgb(0,135,0)">else</span>:
<span class="ansi-green-intense-fg ansi-bold"> 309</span> <span class="ansi-bold" style="color: rgb(0,135,0)">raise</span> <span class="ansi-bold" style="color: rgb(215,95,95)">ValueError</span>(
<span class="ansi-green-intense-fg ansi-bold"> 310</span> <span style="color: rgb(175,0,0)">&#34;</span><span style="color: rgb(175,0,0)">Please provide either dictionary with textual data or </span><span class="ansi-bold" style="color: rgb(175,95,0)">\</span>
<span class="ansi-green-intense-fg ansi-bold"> 311</span> <span style="color: rgb(175,0,0)"> a csv file by setting `use_csv` to True and providing a </span><span class="ansi-bold" style="color: rgb(175,95,0)">\</span>
<span class="ansi-green-intense-fg ansi-bold"> 312</span> <span style="color: rgb(175,0,0)"> `csv_path`.</span><span style="color: rgb(175,0,0)">&#34;</span>
<span class="ansi-green-intense-fg ansi-bold"> 313</span> )
File <span class="ansi-green-fg">~/work/AMMICO/AMMICO/ammico/text.py:397</span>, in <span class="ansi-cyan-fg">PostprocessText.get_text_df</span><span class="ansi-blue-fg">(self, analyze_text)</span>
<span class="ansi-green-intense-fg ansi-bold"> 394</span> <span style="color: rgb(95,135,135)"># use csv file to obtain dataframe and put text_english or text_summary in list</span>
<span class="ansi-green-intense-fg ansi-bold"> 395</span> <span style="color: rgb(95,135,135)"># check that &#34;text_english&#34; or &#34;text_summary&#34; is there</span>
<span class="ansi-green-intense-fg ansi-bold"> 396</span> <span class="ansi-bold" style="color: rgb(0,135,0)">if</span> analyze_text <span class="ansi-bold" style="color: rgb(175,0,255)">not</span> <span class="ansi-bold" style="color: rgb(175,0,255)">in</span> <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>df:
<span class="ansi-green-fg">--&gt; 397</span> <span class="ansi-bold" style="color: rgb(0,135,0)">raise</span> <span class="ansi-bold" style="color: rgb(215,95,95)">ValueError</span>(
<span class="ansi-green-intense-fg ansi-bold"> 398</span> <span style="color: rgb(175,0,0)">&#34;</span><span style="color: rgb(175,0,0)">Please check your provided dataframe - </span><span class="ansi-bold" style="color: rgb(175,95,0)">\</span>
<span class="ansi-green-intense-fg ansi-bold"> 399</span> <span style="color: rgb(175,0,0)"> no </span><span class="ansi-bold" style="color: rgb(175,95,135)">{}</span><span style="color: rgb(175,0,0)"> text data found.</span><span style="color: rgb(175,0,0)">&#34;</span><span style="color: rgb(98,98,98)">.</span>format(
<span class="ansi-green-intense-fg ansi-bold"> 400</span> analyze_text
<span class="ansi-green-intense-fg ansi-bold"> 401</span> )
<span class="ansi-green-intense-fg ansi-bold"> 402</span> )
<span class="ansi-green-intense-fg ansi-bold"> 403</span> <span class="ansi-bold" style="color: rgb(0,135,0)">return</span> <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>df[analyze_text]<span style="color: rgb(98,98,98)">.</span>tolist()
<span class="ansi-red-fg">ValueError</span>: Please check your provided dataframe - no text_english text data found.
</pre></div></div>
</div>
</section>
<section id="Access-frequent-topics">
<h3>Access frequent topics<a class="headerlink" href="#Access-frequent-topics" title="Link to this heading"></a></h3>
<p>A topic of <code class="docutils literal notranslate"><span class="pre">-1</span></code> stands for an outlier and should be ignored. Topic count is the number of occurrences of that topic. The output is structured from most frequent to least frequent topic.</p>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[12]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="nb">print</span><span class="p">(</span><span class="n">topic_df</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
<span class="ansi-red-fg">---------------------------------------------------------------------------</span>
<span class="ansi-red-fg">NameError</span> Traceback (most recent call last)
Cell <span class="ansi-green-fg">In[12], line 1</span>
<span class="ansi-green-fg">----&gt; 1</span> <span style="color: rgb(0,135,0)">print</span>(<span class="ansi-yellow-bg">topic_df</span>)
<span class="ansi-red-fg">NameError</span>: name &#39;topic_df&#39; is not defined
</pre></div></div>
</div>
</section>
<section id="Get-information-for-specific-topic">
<h3>Get information for specific topic<a class="headerlink" href="#Get-information-for-specific-topic" title="Link to this heading"></a></h3>
<p>The most frequent topics can be accessed through <code class="docutils literal notranslate"><span class="pre">most_frequent_topics</span></code> with the most frequently occurring topics first in the list.</p>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[13]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="k">for</span> <span class="n">topic</span> <span class="ow">in</span> <span class="n">most_frequent_topics</span><span class="p">:</span>
<span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Topic:&quot;</span><span class="p">,</span> <span class="n">topic</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
<span class="ansi-red-fg">---------------------------------------------------------------------------</span>
<span class="ansi-red-fg">NameError</span> Traceback (most recent call last)
Cell <span class="ansi-green-fg">In[13], line 1</span>
<span class="ansi-green-fg">----&gt; 1</span> <span class="ansi-bold" style="color: rgb(0,135,0)">for</span> topic <span class="ansi-bold" style="color: rgb(175,0,255)">in</span> <span class="ansi-yellow-bg">most_frequent_topics</span>:
<span class="ansi-green-intense-fg ansi-bold"> 2</span> <span style="color: rgb(0,135,0)">print</span>(<span style="color: rgb(175,0,0)">&#34;</span><span style="color: rgb(175,0,0)">Topic:</span><span style="color: rgb(175,0,0)">&#34;</span>, topic)
<span class="ansi-red-fg">NameError</span>: name &#39;most_frequent_topics&#39; is not defined
</pre></div></div>
</div>
</section>
<section id="Topic-visualization">
<h3>Topic visualization<a class="headerlink" href="#Topic-visualization" title="Link to this heading"></a></h3>
<p>The topics can also be visualized. Careful: This only works if there is sufficient data (quantity and quality).</p>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[14]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">topic_model</span><span class="o">.</span><span class="n">visualize_topics</span><span class="p">()</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
<span class="ansi-red-fg">---------------------------------------------------------------------------</span>
<span class="ansi-red-fg">NameError</span> Traceback (most recent call last)
Cell <span class="ansi-green-fg">In[14], line 1</span>
<span class="ansi-green-fg">----&gt; 1</span> <span class="ansi-yellow-bg">topic_model</span><span style="color: rgb(98,98,98)">.</span>visualize_topics()
<span class="ansi-red-fg">NameError</span>: name &#39;topic_model&#39; is not defined
</pre></div></div>
</div>
</section>
<section id="Save-the-model">
<h3>Save the model<a class="headerlink" href="#Save-the-model" title="Link to this heading"></a></h3>
<p>The model can be saved for future use.</p>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[15]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">topic_model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="s2">&quot;misinfo_posts&quot;</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
<span class="ansi-red-fg">---------------------------------------------------------------------------</span>
<span class="ansi-red-fg">NameError</span> Traceback (most recent call last)
Cell <span class="ansi-green-fg">In[15], line 1</span>
<span class="ansi-green-fg">----&gt; 1</span> <span class="ansi-yellow-bg">topic_model</span><span style="color: rgb(98,98,98)">.</span>save(<span style="color: rgb(175,0,0)">&#34;</span><span style="color: rgb(175,0,0)">misinfo_posts</span><span style="color: rgb(175,0,0)">&#34;</span>)
<span class="ansi-red-fg">NameError</span>: name &#39;topic_model&#39; is not defined
</pre></div></div>
</div>
<div class="nbinput nblast docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[ ]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span>
</pre></div>
</div>
</div>
</section>
</section>
</section>
</div>
</div>
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
<a href="Example%20faces.html" class="btn btn-neutral float-left" title="Facial Expression recognition with DeepFace" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
<a href="Example%20summary.html" class="btn btn-neutral float-right" title="Image summary and visual question answering" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
</div>
<hr/>
<div role="contentinfo">
<p>&#169; Copyright 2022, Scientific Software Center, Heidelberg University.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>