зеркало из
				https://github.com/ssciwr/AMMICO.git
				synced 2025-10-31 22:16:05 +02:00 
			
		
		
		
	
		
			
				
	
	
		
			1037 строки
		
	
	
		
			154 KiB
		
	
	
	
		
			HTML
		
	
	
	
	
	
			
		
		
	
	
			1037 строки
		
	
	
		
			154 KiB
		
	
	
	
		
			HTML
		
	
	
	
	
	
| <!DOCTYPE html>
 | |
| <html class="writer-html5" lang="en" >
 | |
| <head>
 | |
|   <meta charset="utf-8" /><meta name="generator" content="Docutils 0.18.1: http://docutils.sourceforge.net/" />
 | |
| 
 | |
|   <meta name="viewport" content="width=device-width, initial-scale=1.0" />
 | |
|   <title>Notebook for text extraction on image — AMMICO 0.0.1 documentation</title>
 | |
|       <link rel="stylesheet" href="../_static/pygments.css" type="text/css" />
 | |
|       <link rel="stylesheet" href="../_static/css/theme.css" type="text/css" />
 | |
|       <link rel="stylesheet" href="../_static/nbsphinx-code-cells.css" type="text/css" />
 | |
|   <!--[if lt IE 9]>
 | |
|     <script src="../_static/js/html5shiv.min.js"></script>
 | |
|   <![endif]-->
 | |
|   
 | |
|         <script src="../_static/jquery.js"></script>
 | |
|         <script src="../_static/_sphinx_javascript_frameworks_compat.js"></script>
 | |
|         <script data-url_root="../" id="documentation_options" src="../_static/documentation_options.js"></script>
 | |
|         <script src="../_static/doctools.js"></script>
 | |
|         <script src="../_static/sphinx_highlight.js"></script>
 | |
|         <script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script>
 | |
|         <script>window.MathJax = {"tex": {"inlineMath": [["$", "$"], ["\\(", "\\)"]], "processEscapes": true}, "options": {"ignoreHtmlClass": "tex2jax_ignore|mathjax_ignore|document", "processHtmlClass": "tex2jax_process|mathjax_process|math|output_area"}}</script>
 | |
|         <script defer="defer" src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
 | |
|     <script src="../_static/js/theme.js"></script>
 | |
|     <link rel="index" title="Index" href="../genindex.html" />
 | |
|     <link rel="search" title="Search" href="../search.html" />
 | |
|     <link rel="next" title="Image summary and visual question answering" href="Example%20summary.html" />
 | |
|     <link rel="prev" title="Facial Expression recognition with DeepFace" href="Example%20faces.html" /> 
 | |
| </head>
 | |
| 
 | |
| <body class="wy-body-for-nav"> 
 | |
|   <div class="wy-grid-for-nav">
 | |
|     <nav data-toggle="wy-nav-shift" class="wy-nav-side">
 | |
|       <div class="wy-side-scroll">
 | |
|         <div class="wy-side-nav-search" >
 | |
| 
 | |
|           
 | |
|           
 | |
|           <a href="../index.html" class="icon icon-home">
 | |
|             AMMICO
 | |
|           </a>
 | |
| <div role="search">
 | |
|   <form id="rtd-search-form" class="wy-form" action="../search.html" method="get">
 | |
|     <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
 | |
|     <input type="hidden" name="check_keywords" value="yes" />
 | |
|     <input type="hidden" name="area" value="default" />
 | |
|   </form>
 | |
| </div>
 | |
|         </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
 | |
|               <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
 | |
| <ul class="current">
 | |
| <li class="toctree-l1"><a class="reference internal" href="../readme_link.html">AMMICO - AI Media and Misinformation Content Analysis Tool</a></li>
 | |
| <li class="toctree-l1"><a class="reference internal" href="Example%20faces.html">Facial Expression recognition with DeepFace</a></li>
 | |
| <li class="toctree-l1 current"><a class="current reference internal" href="#">Notebook for text extraction on image</a><ul>
 | |
| <li class="toctree-l2"><a class="reference internal" href="#Google-cloud-vision-API">Google cloud vision API</a></li>
 | |
| <li class="toctree-l2"><a class="reference internal" href="#Inspect-the-elements-per-image">Inspect the elements per image</a></li>
 | |
| <li class="toctree-l2"><a class="reference internal" href="#Or-directly-analyze-for-further-processing">Or directly analyze for further processing</a></li>
 | |
| <li class="toctree-l2"><a class="reference internal" href="#Convert-to-dataframe-and-write-csv">Convert to dataframe and write csv</a></li>
 | |
| <li class="toctree-l2"><a class="reference internal" href="#Topic-analysis">Topic analysis</a><ul>
 | |
| <li class="toctree-l3"><a class="reference internal" href="#Option-1:-Use-the-dictionary-as-obtained-from-the-above-analysis.">Option 1: Use the dictionary as obtained from the above analysis.</a></li>
 | |
| <li class="toctree-l3"><a class="reference internal" href="#Option-2:-Read-in-a-csv">Option 2: Read in a csv</a></li>
 | |
| <li class="toctree-l3"><a class="reference internal" href="#Access-frequent-topics">Access frequent topics</a></li>
 | |
| <li class="toctree-l3"><a class="reference internal" href="#Get-information-for-specific-topic">Get information for specific topic</a></li>
 | |
| <li class="toctree-l3"><a class="reference internal" href="#Topic-visualization">Topic visualization</a></li>
 | |
| <li class="toctree-l3"><a class="reference internal" href="#Save-the-model">Save the model</a></li>
 | |
| </ul>
 | |
| </li>
 | |
| </ul>
 | |
| </li>
 | |
| <li class="toctree-l1"><a class="reference internal" href="Example%20summary.html">Image summary and visual question answering</a></li>
 | |
| <li class="toctree-l1"><a class="reference internal" href="Example%20multimodal.html">Image Multimodal Search</a></li>
 | |
| <li class="toctree-l1"><a class="reference internal" href="Example%20colors.html">Color analysis of pictures</a></li>
 | |
| <li class="toctree-l1"><a class="reference internal" href="Example%20objects.html">Objects recognition</a></li>
 | |
| <li class="toctree-l1"><a class="reference internal" href="Example%20cropposts.html">Crop posts from social media posts images</a></li>
 | |
| <li class="toctree-l1"><a class="reference internal" href="../modules.html">AMMICO package modules</a></li>
 | |
| <li class="toctree-l1"><a class="reference internal" href="../license_link.html">License</a></li>
 | |
| </ul>
 | |
| 
 | |
|         </div>
 | |
|       </div>
 | |
|     </nav>
 | |
| 
 | |
|     <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
 | |
|           <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
 | |
|           <a href="../index.html">AMMICO</a>
 | |
|       </nav>
 | |
| 
 | |
|       <div class="wy-nav-content">
 | |
|         <div class="rst-content">
 | |
|           <div role="navigation" aria-label="Page navigation">
 | |
|   <ul class="wy-breadcrumbs">
 | |
|       <li><a href="../index.html" class="icon icon-home" aria-label="Home"></a></li>
 | |
|       <li class="breadcrumb-item active">Notebook for text extraction on image</li>
 | |
|       <li class="wy-breadcrumbs-aside">
 | |
|             <a href="../_sources/notebooks/Example%20text.ipynb.txt" rel="nofollow"> View page source</a>
 | |
|       </li>
 | |
|   </ul>
 | |
|   <hr/>
 | |
| </div>
 | |
|           <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
 | |
|            <div itemprop="articleBody">
 | |
|              
 | |
|   <section id="Notebook-for-text-extraction-on-image">
 | |
| <h1>Notebook for text extraction on image<a class="headerlink" href="#Notebook-for-text-extraction-on-image" title="Permalink to this heading"></a></h1>
 | |
| <p>The text extraction and analysis is carried out using a variety of tools:</p>
 | |
| <ol class="arabic simple">
 | |
| <li><p>Text extraction from the image using <a class="reference external" href="https://cloud.google.com/vision">google-cloud-vision</a></p></li>
 | |
| <li><p>Language detection of the extracted text using <a class="reference external" href="https://py-googletrans.readthedocs.io/en/latest/">Googletrans</a></p></li>
 | |
| <li><p>Translation into English or other languages using <a class="reference external" href="https://py-googletrans.readthedocs.io/en/latest/">Googletrans</a></p></li>
 | |
| <li><p>Cleaning of the text using <a class="reference external" href="https://spacy.io/">spacy</a></p></li>
 | |
| <li><p>Spell-check using <a class="reference external" href="https://textblob.readthedocs.io/en/dev/index.html">TextBlob</a></p></li>
 | |
| <li><p>Subjectivity analysis using <a class="reference external" href="https://textblob.readthedocs.io/en/dev/index.html">TextBlob</a></p></li>
 | |
| <li><p>Text summarization using <a class="reference external" href="https://huggingface.co/docs/transformers/index">transformers</a> pipelines</p></li>
 | |
| <li><p>Sentiment analysis using <a class="reference external" href="https://huggingface.co/docs/transformers/index">transformers</a> pipelines</p></li>
 | |
| <li><p>Named entity recognition using <a class="reference external" href="https://huggingface.co/docs/transformers/index">transformers</a> pipelines</p></li>
 | |
| <li><p>Topic analysis using <a class="reference external" href="https://github.com/MaartenGr/BERTopic">BERTopic</a></p></li>
 | |
| </ol>
 | |
| <p>The first cell is only run on google colab and installs the <a class="reference external" href="https://github.com/ssciwr/AMMICO">ammico</a> package.</p>
 | |
| <p>After that, we can import <code class="docutils literal notranslate"><span class="pre">ammico</span></code> and read in the files given a folder path.</p>
 | |
| <div class="nbinput nblast docutils container">
 | |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[1]:
 | |
| </pre></div>
 | |
| </div>
 | |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="c1"># if running on google colab</span>
 | |
| <span class="c1"># flake8-noqa-cell</span>
 | |
| <span class="kn">import</span> <span class="nn">os</span>
 | |
| 
 | |
| <span class="k">if</span> <span class="s2">"google.colab"</span> <span class="ow">in</span> <span class="nb">str</span><span class="p">(</span><span class="n">get_ipython</span><span class="p">()):</span>
 | |
|     <span class="c1"># update python version</span>
 | |
|     <span class="c1"># install setuptools</span>
 | |
|     <span class="c1"># %pip install setuptools==61 -qqq</span>
 | |
|     <span class="c1"># install ammico</span>
 | |
|     <span class="o">%</span><span class="k">pip</span> install git+https://github.com/ssciwr/ammico.git -qqq
 | |
|     <span class="c1"># mount google drive for data and API key</span>
 | |
|     <span class="kn">from</span> <span class="nn">google.colab</span> <span class="kn">import</span> <span class="n">drive</span>
 | |
| 
 | |
|     <span class="n">drive</span><span class="o">.</span><span class="n">mount</span><span class="p">(</span><span class="s2">"/content/drive"</span><span class="p">)</span>
 | |
| </pre></div>
 | |
| </div>
 | |
| </div>
 | |
| <div class="nbinput nblast docutils container">
 | |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[2]:
 | |
| </pre></div>
 | |
| </div>
 | |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">os</span>
 | |
| <span class="kn">import</span> <span class="nn">ammico</span>
 | |
| <span class="kn">from</span> <span class="nn">ammico</span> <span class="kn">import</span> <span class="n">utils</span> <span class="k">as</span> <span class="n">mutils</span>
 | |
| <span class="kn">from</span> <span class="nn">ammico</span> <span class="kn">import</span> <span class="n">display</span> <span class="k">as</span> <span class="n">mdisplay</span>
 | |
| </pre></div>
 | |
| </div>
 | |
| </div>
 | |
| <p>We select a subset of image files to try the text extraction on, see the <code class="docutils literal notranslate"><span class="pre">limit</span></code> keyword. The <code class="docutils literal notranslate"><span class="pre">find_files</span></code> function finds image files within a given directory:</p>
 | |
| <div class="nbinput nblast docutils container">
 | |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[3]:
 | |
| </pre></div>
 | |
| </div>
 | |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="c1"># Here you need to provide the path to your google drive folder</span>
 | |
| <span class="c1"># or local folder containing the images</span>
 | |
| <span class="n">images</span> <span class="o">=</span> <span class="n">mutils</span><span class="o">.</span><span class="n">find_files</span><span class="p">(</span>
 | |
|     <span class="n">path</span><span class="o">=</span><span class="s2">"data/"</span><span class="p">,</span>
 | |
|     <span class="n">limit</span><span class="o">=</span><span class="mi">10</span><span class="p">,</span>
 | |
| <span class="p">)</span>
 | |
| </pre></div>
 | |
| </div>
 | |
| </div>
 | |
| <p>We need to initialize the main dictionary that contains all information for the images and is updated through each subsequent analysis:</p>
 | |
| <div class="nbinput nblast docutils container">
 | |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[4]:
 | |
| </pre></div>
 | |
| </div>
 | |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">mydict</span> <span class="o">=</span> <span class="n">mutils</span><span class="o">.</span><span class="n">initialize_dict</span><span class="p">(</span><span class="n">images</span><span class="p">)</span>
 | |
| </pre></div>
 | |
| </div>
 | |
| </div>
 | |
| <section id="Google-cloud-vision-API">
 | |
| <h2>Google cloud vision API<a class="headerlink" href="#Google-cloud-vision-API" title="Permalink to this heading"></a></h2>
 | |
| <p>For this you need an API key and must have the app activated in your Google console. The first 1000 images per month are free (as of July 2022).</p>
 | |
| <div class="highlight-none notranslate"><div class="highlight"><pre><span></span>os.environ[
 | |
|     "GOOGLE_APPLICATION_CREDENTIALS"
 | |
| ] = "your-credentials.json"
 | |
| </pre></div>
 | |
| </div>
 | |
| </section>
 | |
| <section id="Inspect-the-elements-per-image">
 | |
| <h2>Inspect the elements per image<a class="headerlink" href="#Inspect-the-elements-per-image" title="Permalink to this heading"></a></h2>
 | |
| <p>To check the analysis, you can inspect the analyzed elements here. Loading the results takes a moment, so please be patient. If you are sure of what you are doing, you can skip this and directly export a csv file in the step below. Here, we display the text extraction and translation results provided by the above libraries. Click on the tabs to see the results in the right sidebar. You may need to increment the <code class="docutils literal notranslate"><span class="pre">port</span></code> number if you are already running several notebook instances on the same
 | |
| server.</p>
 | |
| <div class="nbinput docutils container">
 | |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[5]:
 | |
| </pre></div>
 | |
| </div>
 | |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">analysis_explorer</span> <span class="o">=</span> <span class="n">mdisplay</span><span class="o">.</span><span class="n">AnalysisExplorer</span><span class="p">(</span><span class="n">mydict</span><span class="p">,</span> <span class="n">identify</span><span class="o">=</span><span class="s2">"text-on-image"</span><span class="p">)</span>
 | |
| <span class="n">analysis_explorer</span><span class="o">.</span><span class="n">run_server</span><span class="p">(</span><span class="n">port</span><span class="o">=</span><span class="mi">8054</span><span class="p">)</span>
 | |
| </pre></div>
 | |
| </div>
 | |
| </div>
 | |
| <div class="nboutput nblast docutils container">
 | |
| <div class="prompt empty docutils container">
 | |
| </div>
 | |
| <div class="output_area docutils container">
 | |
| <div class="highlight"><pre>
 | |
| <span class="ansi-red-fg">---------------------------------------------------------------------------</span>
 | |
| <span class="ansi-red-fg">TypeError</span>                                 Traceback (most recent call last)
 | |
| Cell <span class="ansi-green-fg">In[5], line 1</span>
 | |
| <span class="ansi-green-fg">----> 1</span> analysis_explorer <span style="color: rgb(98,98,98)">=</span> <span class="ansi-yellow-bg">mdisplay</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">AnalysisExplorer</span><span class="ansi-yellow-bg">(</span><span class="ansi-yellow-bg">mydict</span><span class="ansi-yellow-bg">,</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg">identify</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">"</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">text-on-image</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">"</span><span class="ansi-yellow-bg">)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">      2</span> analysis_explorer<span style="color: rgb(98,98,98)">.</span>run_server(port<span style="color: rgb(98,98,98)">=</span><span style="color: rgb(98,98,98)">8054</span>)
 | |
| 
 | |
| <span class="ansi-red-fg">TypeError</span>: __init__() got an unexpected keyword argument 'identify'
 | |
| </pre></div></div>
 | |
| </div>
 | |
| </section>
 | |
| <section id="Or-directly-analyze-for-further-processing">
 | |
| <h2>Or directly analyze for further processing<a class="headerlink" href="#Or-directly-analyze-for-further-processing" title="Permalink to this heading"></a></h2>
 | |
| <p>Instead of inspecting each of the images, you can also directly carry out the analysis and export the result into a csv. This may take a while depending on how many images you have loaded. Set the keyword <code class="docutils literal notranslate"><span class="pre">analyse_text</span></code> to <code class="docutils literal notranslate"><span class="pre">True</span></code> if you want the text to be analyzed (spell check, subjectivity, text summary, sentiment, NER).</p>
 | |
| <div class="nbinput docutils container">
 | |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[6]:
 | |
| </pre></div>
 | |
| </div>
 | |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">mydict</span><span class="p">:</span>
 | |
|     <span class="n">mydict</span><span class="p">[</span><span class="n">key</span><span class="p">]</span> <span class="o">=</span> <span class="n">ammico</span><span class="o">.</span><span class="n">text</span><span class="o">.</span><span class="n">TextDetector</span><span class="p">(</span>
 | |
|         <span class="n">mydict</span><span class="p">[</span><span class="n">key</span><span class="p">],</span> <span class="n">analyse_text</span><span class="o">=</span><span class="kc">True</span>
 | |
|     <span class="p">)</span><span class="o">.</span><span class="n">analyse_image</span><span class="p">()</span>
 | |
| </pre></div>
 | |
| </div>
 | |
| </div>
 | |
| <div class="nboutput nblast docutils container">
 | |
| <div class="prompt empty docutils container">
 | |
| </div>
 | |
| <div class="output_area stderr docutils container">
 | |
| <div class="highlight"><pre>
 | |
| Downloading (…)/a4f8f3e/config.json: 100%|██████████| 1.80k/1.80k [00:00<00:00, 743kB/s]
 | |
| Downloading pytorch_model.bin: 100%|██████████| 1.22G/1.22G [01:04<00:00, 18.9MB/s]
 | |
| Downloading (…)okenizer_config.json: 100%|██████████| 26.0/26.0 [00:00<00:00, 22.4kB/s]
 | |
| Downloading (…)e/a4f8f3e/vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 4.44MB/s]
 | |
| Downloading (…)e/a4f8f3e/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 3.87MB/s]
 | |
| Downloading (…)/af0f99b/config.json: 100%|██████████| 629/629 [00:00<00:00, 635kB/s]
 | |
| Downloading pytorch_model.bin: 100%|██████████| 268M/268M [00:08<00:00, 33.3MB/s]
 | |
| Downloading (…)okenizer_config.json: 100%|██████████| 48.0/48.0 [00:00<00:00, 46.0kB/s]
 | |
| Downloading (…)ve/af0f99b/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 11.2MB/s]
 | |
| Downloading (…)/f2482bf/config.json: 100%|██████████| 998/998 [00:00<00:00, 448kB/s]
 | |
| Downloading pytorch_model.bin: 100%|██████████| 1.33G/1.33G [00:44<00:00, 29.8MB/s]
 | |
| Downloading (…)okenizer_config.json: 100%|██████████| 60.0/60.0 [00:00<00:00, 58.7kB/s]
 | |
| Downloading (…)ve/f2482bf/vocab.txt: 100%|██████████| 213k/213k [00:00<00:00, 14.0MB/s]
 | |
| </pre></div></div>
 | |
| </div>
 | |
| </section>
 | |
| <section id="Convert-to-dataframe-and-write-csv">
 | |
| <h2>Convert to dataframe and write csv<a class="headerlink" href="#Convert-to-dataframe-and-write-csv" title="Permalink to this heading"></a></h2>
 | |
| <p>These steps are required to convert the dictionary of dictionaries into a dictionary with lists that can be converted into a pandas dataframe and exported to a csv file.</p>
 | |
| <div class="nbinput nblast docutils container">
 | |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[7]:
 | |
| </pre></div>
 | |
| </div>
 | |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">outdict</span> <span class="o">=</span> <span class="n">mutils</span><span class="o">.</span><span class="n">append_data_to_dict</span><span class="p">(</span><span class="n">mydict</span><span class="p">)</span>
 | |
| <span class="n">df</span> <span class="o">=</span> <span class="n">mutils</span><span class="o">.</span><span class="n">dump_df</span><span class="p">(</span><span class="n">outdict</span><span class="p">)</span>
 | |
| </pre></div>
 | |
| </div>
 | |
| </div>
 | |
| <p>Check the dataframe:</p>
 | |
| <div class="nbinput docutils container">
 | |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[8]:
 | |
| </pre></div>
 | |
| </div>
 | |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">10</span><span class="p">)</span>
 | |
| </pre></div>
 | |
| </div>
 | |
| </div>
 | |
| <div class="nboutput nblast docutils container">
 | |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[8]:
 | |
| </pre></div>
 | |
| </div>
 | |
| <div class="output_area rendered_html docutils container">
 | |
| <div>
 | |
| <style scoped>
 | |
|     .dataframe tbody tr th:only-of-type {
 | |
|         vertical-align: middle;
 | |
|     }
 | |
| 
 | |
|     .dataframe tbody tr th {
 | |
|         vertical-align: top;
 | |
|     }
 | |
| 
 | |
|     .dataframe thead th {
 | |
|         text-align: right;
 | |
|     }
 | |
| </style>
 | |
| <table border="1" class="dataframe">
 | |
|   <thead>
 | |
|     <tr style="text-align: right;">
 | |
|       <th></th>
 | |
|       <th>filename</th>
 | |
|       <th>text</th>
 | |
|       <th>text_language</th>
 | |
|       <th>text_english</th>
 | |
|       <th>text_summary</th>
 | |
|       <th>sentiment</th>
 | |
|       <th>sentiment_score</th>
 | |
|       <th>entity</th>
 | |
|       <th>entity_type</th>
 | |
|     </tr>
 | |
|   </thead>
 | |
|   <tbody>
 | |
|     <tr>
 | |
|       <th>0</th>
 | |
|       <td>data/106349S_por.png</td>
 | |
|       <td>NEWS URGENTE SAMSUNG AO VIVO Rio de Janeiro NO...</td>
 | |
|       <td>pt</td>
 | |
|       <td>NEWS URGENT SAMSUNG LIVE Rio de Janeiro NEW CO...</td>
 | |
|       <td>NEW COUNTING METHOD RJ City HALL EXCLUDES 1,1...</td>
 | |
|       <td>NEGATIVE</td>
 | |
|       <td>0.99</td>
 | |
|       <td>[Rio de Janeiro, C, ##IT, P, ##NA, ##LTO]</td>
 | |
|       <td>[LOC, ORG, LOC, LOC, ORG, LOC]</td>
 | |
|     </tr>
 | |
|     <tr>
 | |
|       <th>1</th>
 | |
|       <td>data/102141_2_eng.png</td>
 | |
|       <td>CORONAVIRUS QUARANTINE CORONAVIRUS OUTBREAK BE...</td>
 | |
|       <td>en</td>
 | |
|       <td>CORONAVIRUS QUARANTINE CORONAVIRUS OUTBREAK BE...</td>
 | |
|       <td>Coronavirus QUARANTINE CORONAVIRUS OUTBREAK</td>
 | |
|       <td>NEGATIVE</td>
 | |
|       <td>0.98</td>
 | |
|       <td>[CORONAVIRUS, ##AR, ##TI, ##RONAVIR, ##C, Co]</td>
 | |
|       <td>[ORG, MISC, MISC, ORG, MISC, MISC]</td>
 | |
|     </tr>
 | |
|     <tr>
 | |
|       <th>2</th>
 | |
|       <td>data/102730_eng.png</td>
 | |
|       <td>400 DEATHS GET E-BOOK X AN Corporation ncy Ser...</td>
 | |
|       <td>en</td>
 | |
|       <td>400 DEATHS GET E-BOOK X AN Corporation ncy Ser...</td>
 | |
|       <td>A municipal worker sprays disinfectant on his...</td>
 | |
|       <td>NEGATIVE</td>
 | |
|       <td>0.99</td>
 | |
|       <td>[AN Corporation ncy Services, Ahmedabad, RE, #...</td>
 | |
|       <td>[ORG, LOC, PER, ORG]</td>
 | |
|     </tr>
 | |
|   </tbody>
 | |
| </table>
 | |
| </div></div>
 | |
| </div>
 | |
| <p>Write the csv file - here you should provide a file path and file name for the csv file to be written.</p>
 | |
| <div class="nbinput nblast docutils container">
 | |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[9]:
 | |
| </pre></div>
 | |
| </div>
 | |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="c1"># Write the csv</span>
 | |
| <span class="n">df</span><span class="o">.</span><span class="n">to_csv</span><span class="p">(</span><span class="s2">"./data_out.csv"</span><span class="p">)</span>
 | |
| </pre></div>
 | |
| </div>
 | |
| </div>
 | |
| </section>
 | |
| <section id="Topic-analysis">
 | |
| <h2>Topic analysis<a class="headerlink" href="#Topic-analysis" title="Permalink to this heading"></a></h2>
 | |
| <p>The topic analysis is carried out with <a class="reference external" href="https://maartengr.github.io/BERTopic/index.html">BERTopic</a>, using an embedded model through a <a class="reference external" href="https://spacy.io/">spaCy</a> pipeline.</p>
 | |
| <p>BERTopic takes a list of strings as input. The more items in the list, the better for the topic modeling. If the below returns an error for <code class="docutils literal notranslate"><span class="pre">analyse_topic()</span></code>, the reason can be that your dataset is too small.</p>
 | |
| <p>You can choose which dataframe entry you would like to have analyzed. The default is <code class="docutils literal notranslate"><span class="pre">text_english</span></code>, but you could, for example, also select <code class="docutils literal notranslate"><span class="pre">text_summary</span></code> or <code class="docutils literal notranslate"><span class="pre">text_english_correct</span></code> by setting the keyword <code class="docutils literal notranslate"><span class="pre">analyze_text</span></code> like so:</p>
 | |
| <p><code class="docutils literal notranslate"><span class="pre">ammico.text.PostprocessText(mydict=mydict,</span> <span class="pre">analyze_text="text_summary").analyse_topic()</span></code></p>
 | |
| <section id="Option-1:-Use-the-dictionary-as-obtained-from-the-above-analysis.">
 | |
| <h3>Option 1: Use the dictionary as obtained from the above analysis.<a class="headerlink" href="#Option-1:-Use-the-dictionary-as-obtained-from-the-above-analysis." title="Permalink to this heading"></a></h3>
 | |
| <div class="nbinput docutils container">
 | |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[10]:
 | |
| </pre></div>
 | |
| </div>
 | |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="c1"># make a list of all the text_english entries per analysed image from the mydict variable as above</span>
 | |
| <span class="n">topic_model</span><span class="p">,</span> <span class="n">topic_df</span><span class="p">,</span> <span class="n">most_frequent_topics</span> <span class="o">=</span> <span class="n">ammico</span><span class="o">.</span><span class="n">text</span><span class="o">.</span><span class="n">PostprocessText</span><span class="p">(</span>
 | |
|     <span class="n">mydict</span><span class="o">=</span><span class="n">mydict</span>
 | |
| <span class="p">)</span><span class="o">.</span><span class="n">analyse_topic</span><span class="p">()</span>
 | |
| </pre></div>
 | |
| </div>
 | |
| </div>
 | |
| <div class="nboutput docutils container">
 | |
| <div class="prompt empty docutils container">
 | |
| </div>
 | |
| <div class="output_area docutils container">
 | |
| <div class="highlight"><pre>
 | |
| Reading data from dict.
 | |
| huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
 | |
| To disable this warning, you can either:
 | |
|         - Avoid using `tokenizers` before the fork if possible
 | |
|         - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
 | |
| Collecting en-core-web-md==3.5.0
 | |
|   Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.5.0/en_core_web_md-3.5.0-py3-none-any.whl (42.8 MB)
 | |
|      ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 42.8/42.8 MB 58.4 MB/s eta 0:00:00
 | |
| Requirement already satisfied: spacy<3.6.0,>=3.5.0 in /opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages (from en-core-web-md==3.5.0) (3.5.3)
 | |
| Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (3.0.12)
 | |
| Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (1.0.4)
 | |
| Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (1.0.9)
 | |
| Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.0.7)
 | |
| Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (3.0.8)
 | |
| Requirement already satisfied: thinc<8.2.0,>=8.1.8 in /opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (8.1.10)
 | |
| Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (1.1.2)
 | |
| Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.4.6)
 | |
| Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.0.8)
 | |
| Requirement already satisfied: typer<0.8.0,>=0.3.0 in /opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (0.7.0)
 | |
| Requirement already satisfied: pathy>=0.10.0 in /opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (0.10.2)
 | |
| Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (6.3.0)
 | |
| Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (4.65.0)
 | |
| Requirement already satisfied: numpy>=1.15.0 in /opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (1.23.4)
 | |
| Requirement already satisfied: requests<3.0.0,>=2.13.0 in /opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.31.0)
 | |
| Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4 in /opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (1.10.9)
 | |
| Requirement already satisfied: jinja2 in /opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (3.1.2)
 | |
| Requirement already satisfied: setuptools in /opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (58.1.0)
 | |
| Requirement already satisfied: packaging>=20.0 in /opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (23.1)
 | |
| Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (3.3.0)
 | |
| Requirement already satisfied: typing-extensions>=4.2.0 in /opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages (from pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (4.6.3)
 | |
| Requirement already satisfied: charset-normalizer<4,>=2 in /opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (3.1.0)
 | |
| Requirement already satisfied: idna<4,>=2.5 in /opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.10)
 | |
| Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (1.26.16)
 | |
| Requirement already satisfied: certifi>=2017.4.17 in /opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2023.5.7)
 | |
| Requirement already satisfied: blis<0.8.0,>=0.7.8 in /opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages (from thinc<8.2.0,>=8.1.8->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (0.7.9)
 | |
| Requirement already satisfied: confection<1.0.0,>=0.0.1 in /opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages (from thinc<8.2.0,>=8.1.8->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (0.0.4)
 | |
| Requirement already satisfied: click<9.0.0,>=7.1.1 in /opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages (from typer<0.8.0,>=0.3.0->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (8.1.3)
 | |
| Requirement already satisfied: MarkupSafe>=2.0 in /opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages (from jinja2->spacy<3.6.0,>=3.5.0->en-core-web-md==3.5.0) (2.1.3)
 | |
| Installing collected packages: en-core-web-md
 | |
| Successfully installed en-core-web-md-3.5.0
 | |
| <span class="ansi-green-fg">✔ Download and installation successful</span>
 | |
| You can now load the package via spacy.load('en_core_web_md')
 | |
| </pre></div></div>
 | |
| </div>
 | |
| <div class="nboutput docutils container">
 | |
| <div class="prompt empty docutils container">
 | |
| </div>
 | |
| <div class="output_area stderr docutils container">
 | |
| <div class="highlight"><pre>
 | |
| 
 | |
| [notice] A new release of pip is available: 23.0.1 -> 23.1.2
 | |
| [notice] To update, run: pip install --upgrade pip
 | |
| </pre></div></div>
 | |
| </div>
 | |
| <div class="nboutput nblast docutils container">
 | |
| <div class="prompt empty docutils container">
 | |
| </div>
 | |
| <div class="output_area docutils container">
 | |
| <div class="highlight"><pre>
 | |
| <span class="ansi-red-fg">---------------------------------------------------------------------------</span>
 | |
| <span class="ansi-red-fg">TypeError</span>                                 Traceback (most recent call last)
 | |
| File <span class="ansi-green-fg">/opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages/bertopic/_bertopic.py:2868</span>, in <span class="ansi-cyan-fg">BERTopic._reduce_dimensionality</span><span class="ansi-blue-fg">(self, embeddings, y, partial_fit)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2867</span> <span class="ansi-bold" style="color: rgb(0,135,0)">try</span>:
 | |
| <span class="ansi-green-fg">-> 2868</span>     <span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">umap_model</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">fit</span><span class="ansi-yellow-bg">(</span><span class="ansi-yellow-bg">embeddings</span><span class="ansi-yellow-bg">,</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg">y</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg">y</span><span class="ansi-yellow-bg">)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2869</span> <span class="ansi-bold" style="color: rgb(0,135,0)">except</span> <span class="ansi-bold" style="color: rgb(215,95,95)">TypeError</span>:
 | |
| 
 | |
| File <span class="ansi-green-fg">/opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages/umap/umap_.py:2684</span>, in <span class="ansi-cyan-fg">UMAP.fit</span><span class="ansi-blue-fg">(self, X, y)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2683</span> <span class="ansi-bold" style="color: rgb(0,135,0)">if</span> <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>transform_mode <span style="color: rgb(98,98,98)">==</span> <span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)">embedding</span><span style="color: rgb(175,0,0)">"</span>:
 | |
| <span class="ansi-green-fg">-> 2684</span>     <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>embedding_, aux_data <span style="color: rgb(98,98,98)">=</span> <span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_fit_embed_data</span><span class="ansi-yellow-bg">(</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2685</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_raw_data</span><span class="ansi-yellow-bg">[</span><span class="ansi-yellow-bg">index</span><span class="ansi-yellow-bg">]</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2686</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">n_epochs</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2687</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">init</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2688</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">random_state</span><span class="ansi-yellow-bg">,</span><span class="ansi-yellow-bg">  </span><span class="ansi-yellow-bg" style="color: rgb(95,135,135)"># JH why raw data?</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2689</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg">)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2690</span>     <span style="color: rgb(95,135,135)"># Assign any points that are fully disconnected from our manifold(s) to have embedding</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2691</span>     <span style="color: rgb(95,135,135)"># coordinates of np.nan.  These will be filtered by our plotting functions automatically.</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2692</span>     <span style="color: rgb(95,135,135)"># They also prevent users from being deceived a distance query to one of these points.</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2693</span>     <span style="color: rgb(95,135,135)"># Might be worth moving this into simplicial_set_embedding or _fit_embed_data</span>
 | |
| 
 | |
| File <span class="ansi-green-fg">/opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages/umap/umap_.py:2717</span>, in <span class="ansi-cyan-fg">UMAP._fit_embed_data</span><span class="ansi-blue-fg">(self, X, n_epochs, init, random_state)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2714</span> <span style="color: rgb(175,0,0)">"""A method wrapper for simplicial_set_embedding that can be</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2715</span> <span style="color: rgb(175,0,0)">replaced by subclasses.</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2716</span> <span style="color: rgb(175,0,0)">"""</span>
 | |
| <span class="ansi-green-fg">-> 2717</span> <span class="ansi-bold" style="color: rgb(0,135,0)">return</span> <span class="ansi-yellow-bg">simplicial_set_embedding</span><span class="ansi-yellow-bg">(</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2718</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg">X</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2719</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">graph_</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2720</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">n_components</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2721</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_initial_alpha</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2722</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_a</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2723</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_b</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2724</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">repulsion_strength</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2725</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">negative_sample_rate</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2726</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg">n_epochs</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2727</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg">init</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2728</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg">random_state</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2729</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_input_distance_func</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2730</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_metric_kwds</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2731</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">densmap</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2732</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_densmap_kwds</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2733</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">output_dens</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2734</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_output_distance_func</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2735</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_output_metric_kwds</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2736</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">output_metric</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg ansi-bold" style="color: rgb(175,0,255)">in</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg">(</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">"</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">euclidean</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">"</span><span class="ansi-yellow-bg">,</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">"</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">l2</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">"</span><span class="ansi-yellow-bg">)</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2737</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">random_state</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg ansi-bold" style="color: rgb(175,0,255)">is</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg ansi-bold" style="color: rgb(0,135,0)">None</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2738</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">verbose</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2739</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg">tqdm_kwds</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">tqdm_kwds</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2740</span> <span class="ansi-yellow-bg">)</span>
 | |
| 
 | |
| File <span class="ansi-green-fg">/opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages/umap/umap_.py:1078</span>, in <span class="ansi-cyan-fg">simplicial_set_embedding</span><span class="ansi-blue-fg">(data, graph, n_components, initial_alpha, a, b, gamma, negative_sample_rate, n_epochs, init, random_state, metric, metric_kwds, densmap, densmap_kwds, output_dens, output_metric, output_metric_kwds, euclidean_output, parallel, verbose, tqdm_kwds)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1076</span> <span class="ansi-bold" style="color: rgb(0,135,0)">elif</span> <span style="color: rgb(0,135,0)">isinstance</span>(init, <span style="color: rgb(0,135,0)">str</span>) <span class="ansi-bold" style="color: rgb(175,0,255)">and</span> init <span style="color: rgb(98,98,98)">==</span> <span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)">spectral</span><span style="color: rgb(175,0,0)">"</span>:
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1077</span>     <span style="color: rgb(95,135,135)"># We add a little noise to avoid local minima for optimization to come</span>
 | |
| <span class="ansi-green-fg">-> 1078</span>     initialisation <span style="color: rgb(98,98,98)">=</span> <span class="ansi-yellow-bg">spectral_layout</span><span class="ansi-yellow-bg">(</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1079</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">data</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1080</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">graph</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1081</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">n_components</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1082</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">random_state</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1083</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">metric</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg">metric</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1084</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">metric_kwds</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg">metric_kwds</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1085</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg">)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1086</span>     expansion <span style="color: rgb(98,98,98)">=</span> <span style="color: rgb(98,98,98)">10.0</span> <span style="color: rgb(98,98,98)">/</span> np<span style="color: rgb(98,98,98)">.</span>abs(initialisation)<span style="color: rgb(98,98,98)">.</span>max()
 | |
| 
 | |
| File <span class="ansi-green-fg">/opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages/umap/spectral.py:332</span>, in <span class="ansi-cyan-fg">spectral_layout</span><span class="ansi-blue-fg">(data, graph, dim, random_state, metric, metric_kwds)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    331</span> <span class="ansi-bold" style="color: rgb(0,135,0)">if</span> L<span style="color: rgb(98,98,98)">.</span>shape[<span style="color: rgb(98,98,98)">0</span>] <span style="color: rgb(98,98,98)"><</span> <span style="color: rgb(98,98,98)">2000000</span>:
 | |
| <span class="ansi-green-fg">--> 332</span>     eigenvalues, eigenvectors <span style="color: rgb(98,98,98)">=</span> <span class="ansi-yellow-bg">scipy</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">sparse</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">linalg</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">eigsh</span><span class="ansi-yellow-bg">(</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    333</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">L</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    334</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">k</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    335</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">which</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">"</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">SM</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">"</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    336</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">ncv</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg">num_lanczos_vectors</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    337</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">tol</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">1e-4</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    338</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">v0</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg">np</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">ones</span><span class="ansi-yellow-bg">(</span><span class="ansi-yellow-bg">L</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">shape</span><span class="ansi-yellow-bg">[</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">0</span><span class="ansi-yellow-bg">]</span><span class="ansi-yellow-bg">)</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    339</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">maxiter</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg">graph</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">shape</span><span class="ansi-yellow-bg">[</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">0</span><span class="ansi-yellow-bg">]</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">*</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">5</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    340</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg">)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    341</span> <span class="ansi-bold" style="color: rgb(0,135,0)">else</span>:
 | |
| 
 | |
| File <span class="ansi-green-fg">/opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages/scipy/sparse/linalg/_eigen/arpack/arpack.py:1605</span>, in <span class="ansi-cyan-fg">eigsh</span><span class="ansi-blue-fg">(A, k, M, sigma, which, v0, ncv, maxiter, tol, return_eigenvectors, Minv, OPinv, mode)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1604</span> <span class="ansi-bold" style="color: rgb(0,135,0)">if</span> issparse(A):
 | |
| <span class="ansi-green-fg">-> 1605</span>     <span class="ansi-bold" style="color: rgb(0,135,0)">raise</span> <span class="ansi-bold" style="color: rgb(215,95,95)">TypeError</span>(<span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)">Cannot use scipy.linalg.eigh for sparse A with </span><span style="color: rgb(175,0,0)">"</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1606</span>                     <span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)">k >= N. Use scipy.linalg.eigh(A.toarray()) or</span><span style="color: rgb(175,0,0)">"</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1607</span>                     <span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)"> reduce k.</span><span style="color: rgb(175,0,0)">"</span>)
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1608</span> <span class="ansi-bold" style="color: rgb(0,135,0)">if</span> <span style="color: rgb(0,135,0)">isinstance</span>(A, LinearOperator):
 | |
| 
 | |
| <span class="ansi-red-fg">TypeError</span>: Cannot use scipy.linalg.eigh for sparse A with k >= N. Use scipy.linalg.eigh(A.toarray()) or reduce k.
 | |
| 
 | |
| During handling of the above exception, another exception occurred:
 | |
| 
 | |
| <span class="ansi-red-fg">TypeError</span>                                 Traceback (most recent call last)
 | |
| Cell <span class="ansi-green-fg">In[10], line 2</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">      1</span> <span style="color: rgb(95,135,135)"># make a list of all the text_english entries per analysed image from the mydict variable as above</span>
 | |
| <span class="ansi-green-fg">----> 2</span> topic_model, topic_df, most_frequent_topics <span style="color: rgb(98,98,98)">=</span> <span class="ansi-yellow-bg">ammico</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">text</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">PostprocessText</span><span class="ansi-yellow-bg">(</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">      3</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg">mydict</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg">mydict</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">      4</span> <span class="ansi-yellow-bg">)</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">analyse_topic</span><span class="ansi-yellow-bg">(</span><span class="ansi-yellow-bg">)</span>
 | |
| 
 | |
| File <span class="ansi-green-fg">~/work/AMMICO/AMMICO/ammico/text.py:221</span>, in <span class="ansi-cyan-fg">PostprocessText.analyse_topic</span><span class="ansi-blue-fg">(self, return_topics)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    219</span> <span class="ansi-bold" style="color: rgb(0,135,0)">except</span> <span class="ansi-bold" style="color: rgb(215,95,95)">TypeError</span>:
 | |
| <span class="ansi-green-intense-fg ansi-bold">    220</span>     <span style="color: rgb(0,135,0)">print</span>(<span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)">BERTopic excited with an error - maybe your dataset is too small?</span><span style="color: rgb(175,0,0)">"</span>)
 | |
| <span class="ansi-green-fg">--> 221</span> <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>topics, <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>probs <span style="color: rgb(98,98,98)">=</span> <span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">topic_model</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">fit_transform</span><span class="ansi-yellow-bg">(</span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">list_text_english</span><span class="ansi-yellow-bg">)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    222</span> <span style="color: rgb(95,135,135)"># return the topic list</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    223</span> topic_df <span style="color: rgb(98,98,98)">=</span> <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>topic_model<span style="color: rgb(98,98,98)">.</span>get_topic_info()
 | |
| 
 | |
| File <span class="ansi-green-fg">/opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages/bertopic/_bertopic.py:356</span>, in <span class="ansi-cyan-fg">BERTopic.fit_transform</span><span class="ansi-blue-fg">(self, documents, embeddings, y)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    354</span> <span class="ansi-bold" style="color: rgb(0,135,0)">if</span> <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>seed_topic_list <span class="ansi-bold" style="color: rgb(175,0,255)">is</span> <span class="ansi-bold" style="color: rgb(175,0,255)">not</span> <span class="ansi-bold" style="color: rgb(0,135,0)">None</span> <span class="ansi-bold" style="color: rgb(175,0,255)">and</span> <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>embedding_model <span class="ansi-bold" style="color: rgb(175,0,255)">is</span> <span class="ansi-bold" style="color: rgb(175,0,255)">not</span> <span class="ansi-bold" style="color: rgb(0,135,0)">None</span>:
 | |
| <span class="ansi-green-intense-fg ansi-bold">    355</span>     y, embeddings <span style="color: rgb(98,98,98)">=</span> <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>_guided_topic_modeling(embeddings)
 | |
| <span class="ansi-green-fg">--> 356</span> umap_embeddings <span style="color: rgb(98,98,98)">=</span> <span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_reduce_dimensionality</span><span class="ansi-yellow-bg">(</span><span class="ansi-yellow-bg">embeddings</span><span class="ansi-yellow-bg">,</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg">y</span><span class="ansi-yellow-bg">)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    358</span> <span style="color: rgb(95,135,135)"># Cluster reduced embeddings</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    359</span> documents, probabilities <span style="color: rgb(98,98,98)">=</span> <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>_cluster_embeddings(umap_embeddings, documents, y<span style="color: rgb(98,98,98)">=</span>y)
 | |
| 
 | |
| File <span class="ansi-green-fg">/opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages/bertopic/_bertopic.py:2872</span>, in <span class="ansi-cyan-fg">BERTopic._reduce_dimensionality</span><span class="ansi-blue-fg">(self, embeddings, y, partial_fit)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2869</span>     <span class="ansi-bold" style="color: rgb(0,135,0)">except</span> <span class="ansi-bold" style="color: rgb(215,95,95)">TypeError</span>:
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2870</span>         logger<span style="color: rgb(98,98,98)">.</span>info(<span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)">The dimensionality reduction algorithm did not contain the `y` parameter and</span><span style="color: rgb(175,0,0)">"</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2871</span>                     <span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)"> therefore the `y` parameter was not used</span><span style="color: rgb(175,0,0)">"</span>)
 | |
| <span class="ansi-green-fg">-> 2872</span>         <span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">umap_model</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">fit</span><span class="ansi-yellow-bg">(</span><span class="ansi-yellow-bg">embeddings</span><span class="ansi-yellow-bg">)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2874</span> umap_embeddings <span style="color: rgb(98,98,98)">=</span> <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>umap_model<span style="color: rgb(98,98,98)">.</span>transform(embeddings)
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2875</span> logger<span style="color: rgb(98,98,98)">.</span>info(<span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)">Reduced dimensionality</span><span style="color: rgb(175,0,0)">"</span>)
 | |
| 
 | |
| File <span class="ansi-green-fg">/opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages/umap/umap_.py:2684</span>, in <span class="ansi-cyan-fg">UMAP.fit</span><span class="ansi-blue-fg">(self, X, y)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2681</span>     <span style="color: rgb(0,135,0)">print</span>(ts(), <span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)">Construct embedding</span><span style="color: rgb(175,0,0)">"</span>)
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2683</span> <span class="ansi-bold" style="color: rgb(0,135,0)">if</span> <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>transform_mode <span style="color: rgb(98,98,98)">==</span> <span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)">embedding</span><span style="color: rgb(175,0,0)">"</span>:
 | |
| <span class="ansi-green-fg">-> 2684</span>     <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>embedding_, aux_data <span style="color: rgb(98,98,98)">=</span> <span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_fit_embed_data</span><span class="ansi-yellow-bg">(</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2685</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_raw_data</span><span class="ansi-yellow-bg">[</span><span class="ansi-yellow-bg">index</span><span class="ansi-yellow-bg">]</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2686</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">n_epochs</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2687</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">init</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2688</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">random_state</span><span class="ansi-yellow-bg">,</span><span class="ansi-yellow-bg">  </span><span class="ansi-yellow-bg" style="color: rgb(95,135,135)"># JH why raw data?</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2689</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg">)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2690</span>     <span style="color: rgb(95,135,135)"># Assign any points that are fully disconnected from our manifold(s) to have embedding</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2691</span>     <span style="color: rgb(95,135,135)"># coordinates of np.nan.  These will be filtered by our plotting functions automatically.</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2692</span>     <span style="color: rgb(95,135,135)"># They also prevent users from being deceived a distance query to one of these points.</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2693</span>     <span style="color: rgb(95,135,135)"># Might be worth moving this into simplicial_set_embedding or _fit_embed_data</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2694</span>     disconnected_vertices <span style="color: rgb(98,98,98)">=</span> np<span style="color: rgb(98,98,98)">.</span>array(<span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>graph_<span style="color: rgb(98,98,98)">.</span>sum(axis<span style="color: rgb(98,98,98)">=</span><span style="color: rgb(98,98,98)">1</span>))<span style="color: rgb(98,98,98)">.</span>flatten() <span style="color: rgb(98,98,98)">==</span> <span style="color: rgb(98,98,98)">0</span>
 | |
| 
 | |
| File <span class="ansi-green-fg">/opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages/umap/umap_.py:2717</span>, in <span class="ansi-cyan-fg">UMAP._fit_embed_data</span><span class="ansi-blue-fg">(self, X, n_epochs, init, random_state)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2713</span> <span class="ansi-bold" style="color: rgb(0,135,0)">def</span> <span style="color: rgb(0,0,255)">_fit_embed_data</span>(<span style="color: rgb(0,135,0)">self</span>, X, n_epochs, init, random_state):
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2714</span> <span style="color: rgb(188,188,188)">    </span><span style="color: rgb(175,0,0)">"""A method wrapper for simplicial_set_embedding that can be</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2715</span> <span style="color: rgb(175,0,0)">    replaced by subclasses.</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2716</span> <span style="color: rgb(175,0,0)">    """</span>
 | |
| <span class="ansi-green-fg">-> 2717</span>     <span class="ansi-bold" style="color: rgb(0,135,0)">return</span> <span class="ansi-yellow-bg">simplicial_set_embedding</span><span class="ansi-yellow-bg">(</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2718</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">X</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2719</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">graph_</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2720</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">n_components</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2721</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_initial_alpha</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2722</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_a</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2723</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_b</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2724</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">repulsion_strength</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2725</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">negative_sample_rate</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2726</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">n_epochs</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2727</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">init</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2728</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">random_state</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2729</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_input_distance_func</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2730</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_metric_kwds</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2731</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">densmap</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2732</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_densmap_kwds</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2733</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">output_dens</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2734</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_output_distance_func</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2735</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_output_metric_kwds</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2736</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">output_metric</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg ansi-bold" style="color: rgb(175,0,255)">in</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg">(</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">"</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">euclidean</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">"</span><span class="ansi-yellow-bg">,</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">"</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">l2</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">"</span><span class="ansi-yellow-bg">)</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2737</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">random_state</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg ansi-bold" style="color: rgb(175,0,255)">is</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg ansi-bold" style="color: rgb(0,135,0)">None</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2738</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">verbose</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2739</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">tqdm_kwds</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">tqdm_kwds</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2740</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg">)</span>
 | |
| 
 | |
| File <span class="ansi-green-fg">/opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages/umap/umap_.py:1078</span>, in <span class="ansi-cyan-fg">simplicial_set_embedding</span><span class="ansi-blue-fg">(data, graph, n_components, initial_alpha, a, b, gamma, negative_sample_rate, n_epochs, init, random_state, metric, metric_kwds, densmap, densmap_kwds, output_dens, output_metric, output_metric_kwds, euclidean_output, parallel, verbose, tqdm_kwds)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1073</span>     embedding <span style="color: rgb(98,98,98)">=</span> random_state<span style="color: rgb(98,98,98)">.</span>uniform(
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1074</span>         low<span style="color: rgb(98,98,98)">=</span><span style="color: rgb(98,98,98)">-</span><span style="color: rgb(98,98,98)">10.0</span>, high<span style="color: rgb(98,98,98)">=</span><span style="color: rgb(98,98,98)">10.0</span>, size<span style="color: rgb(98,98,98)">=</span>(graph<span style="color: rgb(98,98,98)">.</span>shape[<span style="color: rgb(98,98,98)">0</span>], n_components)
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1075</span>     )<span style="color: rgb(98,98,98)">.</span>astype(np<span style="color: rgb(98,98,98)">.</span>float32)
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1076</span> <span class="ansi-bold" style="color: rgb(0,135,0)">elif</span> <span style="color: rgb(0,135,0)">isinstance</span>(init, <span style="color: rgb(0,135,0)">str</span>) <span class="ansi-bold" style="color: rgb(175,0,255)">and</span> init <span style="color: rgb(98,98,98)">==</span> <span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)">spectral</span><span style="color: rgb(175,0,0)">"</span>:
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1077</span>     <span style="color: rgb(95,135,135)"># We add a little noise to avoid local minima for optimization to come</span>
 | |
| <span class="ansi-green-fg">-> 1078</span>     initialisation <span style="color: rgb(98,98,98)">=</span> <span class="ansi-yellow-bg">spectral_layout</span><span class="ansi-yellow-bg">(</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1079</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">data</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1080</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">graph</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1081</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">n_components</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1082</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">random_state</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1083</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">metric</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg">metric</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1084</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">metric_kwds</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg">metric_kwds</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1085</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg">)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1086</span>     expansion <span style="color: rgb(98,98,98)">=</span> <span style="color: rgb(98,98,98)">10.0</span> <span style="color: rgb(98,98,98)">/</span> np<span style="color: rgb(98,98,98)">.</span>abs(initialisation)<span style="color: rgb(98,98,98)">.</span>max()
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1087</span>     embedding <span style="color: rgb(98,98,98)">=</span> (initialisation <span style="color: rgb(98,98,98)">*</span> expansion)<span style="color: rgb(98,98,98)">.</span>astype(
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1088</span>         np<span style="color: rgb(98,98,98)">.</span>float32
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1089</span>     ) <span style="color: rgb(98,98,98)">+</span> random_state<span style="color: rgb(98,98,98)">.</span>normal(
 | |
| <span class="ansi-green-fg">   (...)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1092</span>         np<span style="color: rgb(98,98,98)">.</span>float32
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1093</span>     )
 | |
| 
 | |
| File <span class="ansi-green-fg">/opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages/umap/spectral.py:332</span>, in <span class="ansi-cyan-fg">spectral_layout</span><span class="ansi-blue-fg">(data, graph, dim, random_state, metric, metric_kwds)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    330</span> <span class="ansi-bold" style="color: rgb(0,135,0)">try</span>:
 | |
| <span class="ansi-green-intense-fg ansi-bold">    331</span>     <span class="ansi-bold" style="color: rgb(0,135,0)">if</span> L<span style="color: rgb(98,98,98)">.</span>shape[<span style="color: rgb(98,98,98)">0</span>] <span style="color: rgb(98,98,98)"><</span> <span style="color: rgb(98,98,98)">2000000</span>:
 | |
| <span class="ansi-green-fg">--> 332</span>         eigenvalues, eigenvectors <span style="color: rgb(98,98,98)">=</span> <span class="ansi-yellow-bg">scipy</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">sparse</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">linalg</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">eigsh</span><span class="ansi-yellow-bg">(</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    333</span> <span class="ansi-yellow-bg">            </span><span class="ansi-yellow-bg">L</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    334</span> <span class="ansi-yellow-bg">            </span><span class="ansi-yellow-bg">k</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    335</span> <span class="ansi-yellow-bg">            </span><span class="ansi-yellow-bg">which</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">"</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">SM</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">"</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    336</span> <span class="ansi-yellow-bg">            </span><span class="ansi-yellow-bg">ncv</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg">num_lanczos_vectors</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    337</span> <span class="ansi-yellow-bg">            </span><span class="ansi-yellow-bg">tol</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">1e-4</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    338</span> <span class="ansi-yellow-bg">            </span><span class="ansi-yellow-bg">v0</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg">np</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">ones</span><span class="ansi-yellow-bg">(</span><span class="ansi-yellow-bg">L</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">shape</span><span class="ansi-yellow-bg">[</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">0</span><span class="ansi-yellow-bg">]</span><span class="ansi-yellow-bg">)</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    339</span> <span class="ansi-yellow-bg">            </span><span class="ansi-yellow-bg">maxiter</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg">graph</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">shape</span><span class="ansi-yellow-bg">[</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">0</span><span class="ansi-yellow-bg">]</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">*</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">5</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    340</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    341</span>     <span class="ansi-bold" style="color: rgb(0,135,0)">else</span>:
 | |
| <span class="ansi-green-intense-fg ansi-bold">    342</span>         eigenvalues, eigenvectors <span style="color: rgb(98,98,98)">=</span> scipy<span style="color: rgb(98,98,98)">.</span>sparse<span style="color: rgb(98,98,98)">.</span>linalg<span style="color: rgb(98,98,98)">.</span>lobpcg(
 | |
| <span class="ansi-green-intense-fg ansi-bold">    343</span>             L, random_state<span style="color: rgb(98,98,98)">.</span>normal(size<span style="color: rgb(98,98,98)">=</span>(L<span style="color: rgb(98,98,98)">.</span>shape[<span style="color: rgb(98,98,98)">0</span>], k)), largest<span style="color: rgb(98,98,98)">=</span><span class="ansi-bold" style="color: rgb(0,135,0)">False</span>, tol<span style="color: rgb(98,98,98)">=</span><span style="color: rgb(98,98,98)">1e-8</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    344</span>         )
 | |
| 
 | |
| File <span class="ansi-green-fg">/opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages/scipy/sparse/linalg/_eigen/arpack/arpack.py:1605</span>, in <span class="ansi-cyan-fg">eigsh</span><span class="ansi-blue-fg">(A, k, M, sigma, which, v0, ncv, maxiter, tol, return_eigenvectors, Minv, OPinv, mode)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1600</span> warnings<span style="color: rgb(98,98,98)">.</span>warn(<span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)">k >= N for N * N square matrix. </span><span style="color: rgb(175,0,0)">"</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1601</span>               <span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)">Attempting to use scipy.linalg.eigh instead.</span><span style="color: rgb(175,0,0)">"</span>,
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1602</span>               <span class="ansi-bold" style="color: rgb(215,95,95)">RuntimeWarning</span>)
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1604</span> <span class="ansi-bold" style="color: rgb(0,135,0)">if</span> issparse(A):
 | |
| <span class="ansi-green-fg">-> 1605</span>     <span class="ansi-bold" style="color: rgb(0,135,0)">raise</span> <span class="ansi-bold" style="color: rgb(215,95,95)">TypeError</span>(<span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)">Cannot use scipy.linalg.eigh for sparse A with </span><span style="color: rgb(175,0,0)">"</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1606</span>                     <span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)">k >= N. Use scipy.linalg.eigh(A.toarray()) or</span><span style="color: rgb(175,0,0)">"</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1607</span>                     <span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)"> reduce k.</span><span style="color: rgb(175,0,0)">"</span>)
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1608</span> <span class="ansi-bold" style="color: rgb(0,135,0)">if</span> <span style="color: rgb(0,135,0)">isinstance</span>(A, LinearOperator):
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1609</span>     <span class="ansi-bold" style="color: rgb(0,135,0)">raise</span> <span class="ansi-bold" style="color: rgb(215,95,95)">TypeError</span>(<span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)">Cannot use scipy.linalg.eigh for LinearOperator </span><span style="color: rgb(175,0,0)">"</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1610</span>                     <span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)">A with k >= N.</span><span style="color: rgb(175,0,0)">"</span>)
 | |
| 
 | |
| <span class="ansi-red-fg">TypeError</span>: Cannot use scipy.linalg.eigh for sparse A with k >= N. Use scipy.linalg.eigh(A.toarray()) or reduce k.
 | |
| </pre></div></div>
 | |
| </div>
 | |
| </section>
 | |
| <section id="Option-2:-Read-in-a-csv">
 | |
| <h3>Option 2: Read in a csv<a class="headerlink" href="#Option-2:-Read-in-a-csv" title="Permalink to this heading"></a></h3>
 | |
| <p>To avoid analysing too many images with Google Cloud Vision, use the CSV output to obtain the text when rerunning images that have already been analysed.</p>
 | |
| <div class="nbinput docutils container">
 | |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[11]:
 | |
| </pre></div>
 | |
| </div>
 | |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">input_file_path</span> <span class="o">=</span> <span class="s2">"data_out.csv"</span>
 | |
| <span class="n">topic_model</span><span class="p">,</span> <span class="n">topic_df</span><span class="p">,</span> <span class="n">most_frequent_topics</span> <span class="o">=</span> <span class="n">ammico</span><span class="o">.</span><span class="n">text</span><span class="o">.</span><span class="n">PostprocessText</span><span class="p">(</span>
 | |
|     <span class="n">use_csv</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">csv_path</span><span class="o">=</span><span class="n">input_file_path</span>
 | |
| <span class="p">)</span><span class="o">.</span><span class="n">analyse_topic</span><span class="p">(</span><span class="n">return_topics</span><span class="o">=</span><span class="mi">10</span><span class="p">)</span>
 | |
| </pre></div>
 | |
| </div>
 | |
| </div>
 | |
| <div class="nboutput docutils container">
 | |
| <div class="prompt empty docutils container">
 | |
| </div>
 | |
| <div class="output_area docutils container">
 | |
| <div class="highlight"><pre>
 | |
| Reading data from df.
 | |
| </pre></div></div>
 | |
| </div>
 | |
| <div class="nboutput nblast docutils container">
 | |
| <div class="prompt empty docutils container">
 | |
| </div>
 | |
| <div class="output_area docutils container">
 | |
| <div class="highlight"><pre>
 | |
| <span class="ansi-red-fg">---------------------------------------------------------------------------</span>
 | |
| <span class="ansi-red-fg">TypeError</span>                                 Traceback (most recent call last)
 | |
| File <span class="ansi-green-fg">/opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages/bertopic/_bertopic.py:2868</span>, in <span class="ansi-cyan-fg">BERTopic._reduce_dimensionality</span><span class="ansi-blue-fg">(self, embeddings, y, partial_fit)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2867</span> <span class="ansi-bold" style="color: rgb(0,135,0)">try</span>:
 | |
| <span class="ansi-green-fg">-> 2868</span>     <span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">umap_model</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">fit</span><span class="ansi-yellow-bg">(</span><span class="ansi-yellow-bg">embeddings</span><span class="ansi-yellow-bg">,</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg">y</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg">y</span><span class="ansi-yellow-bg">)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2869</span> <span class="ansi-bold" style="color: rgb(0,135,0)">except</span> <span class="ansi-bold" style="color: rgb(215,95,95)">TypeError</span>:
 | |
| 
 | |
| File <span class="ansi-green-fg">/opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages/umap/umap_.py:2684</span>, in <span class="ansi-cyan-fg">UMAP.fit</span><span class="ansi-blue-fg">(self, X, y)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2683</span> <span class="ansi-bold" style="color: rgb(0,135,0)">if</span> <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>transform_mode <span style="color: rgb(98,98,98)">==</span> <span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)">embedding</span><span style="color: rgb(175,0,0)">"</span>:
 | |
| <span class="ansi-green-fg">-> 2684</span>     <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>embedding_, aux_data <span style="color: rgb(98,98,98)">=</span> <span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_fit_embed_data</span><span class="ansi-yellow-bg">(</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2685</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_raw_data</span><span class="ansi-yellow-bg">[</span><span class="ansi-yellow-bg">index</span><span class="ansi-yellow-bg">]</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2686</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">n_epochs</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2687</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">init</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2688</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">random_state</span><span class="ansi-yellow-bg">,</span><span class="ansi-yellow-bg">  </span><span class="ansi-yellow-bg" style="color: rgb(95,135,135)"># JH why raw data?</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2689</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg">)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2690</span>     <span style="color: rgb(95,135,135)"># Assign any points that are fully disconnected from our manifold(s) to have embedding</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2691</span>     <span style="color: rgb(95,135,135)"># coordinates of np.nan.  These will be filtered by our plotting functions automatically.</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2692</span>     <span style="color: rgb(95,135,135)"># They also prevent users from being deceived a distance query to one of these points.</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2693</span>     <span style="color: rgb(95,135,135)"># Might be worth moving this into simplicial_set_embedding or _fit_embed_data</span>
 | |
| 
 | |
| File <span class="ansi-green-fg">/opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages/umap/umap_.py:2717</span>, in <span class="ansi-cyan-fg">UMAP._fit_embed_data</span><span class="ansi-blue-fg">(self, X, n_epochs, init, random_state)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2714</span> <span style="color: rgb(175,0,0)">"""A method wrapper for simplicial_set_embedding that can be</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2715</span> <span style="color: rgb(175,0,0)">replaced by subclasses.</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2716</span> <span style="color: rgb(175,0,0)">"""</span>
 | |
| <span class="ansi-green-fg">-> 2717</span> <span class="ansi-bold" style="color: rgb(0,135,0)">return</span> <span class="ansi-yellow-bg">simplicial_set_embedding</span><span class="ansi-yellow-bg">(</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2718</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg">X</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2719</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">graph_</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2720</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">n_components</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2721</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_initial_alpha</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2722</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_a</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2723</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_b</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2724</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">repulsion_strength</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2725</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">negative_sample_rate</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2726</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg">n_epochs</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2727</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg">init</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2728</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg">random_state</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2729</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_input_distance_func</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2730</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_metric_kwds</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2731</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">densmap</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2732</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_densmap_kwds</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2733</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">output_dens</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2734</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_output_distance_func</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2735</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_output_metric_kwds</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2736</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">output_metric</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg ansi-bold" style="color: rgb(175,0,255)">in</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg">(</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">"</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">euclidean</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">"</span><span class="ansi-yellow-bg">,</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">"</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">l2</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">"</span><span class="ansi-yellow-bg">)</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2737</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">random_state</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg ansi-bold" style="color: rgb(175,0,255)">is</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg ansi-bold" style="color: rgb(0,135,0)">None</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2738</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">verbose</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2739</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg">tqdm_kwds</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">tqdm_kwds</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2740</span> <span class="ansi-yellow-bg">)</span>
 | |
| 
 | |
| File <span class="ansi-green-fg">/opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages/umap/umap_.py:1078</span>, in <span class="ansi-cyan-fg">simplicial_set_embedding</span><span class="ansi-blue-fg">(data, graph, n_components, initial_alpha, a, b, gamma, negative_sample_rate, n_epochs, init, random_state, metric, metric_kwds, densmap, densmap_kwds, output_dens, output_metric, output_metric_kwds, euclidean_output, parallel, verbose, tqdm_kwds)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1076</span> <span class="ansi-bold" style="color: rgb(0,135,0)">elif</span> <span style="color: rgb(0,135,0)">isinstance</span>(init, <span style="color: rgb(0,135,0)">str</span>) <span class="ansi-bold" style="color: rgb(175,0,255)">and</span> init <span style="color: rgb(98,98,98)">==</span> <span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)">spectral</span><span style="color: rgb(175,0,0)">"</span>:
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1077</span>     <span style="color: rgb(95,135,135)"># We add a little noise to avoid local minima for optimization to come</span>
 | |
| <span class="ansi-green-fg">-> 1078</span>     initialisation <span style="color: rgb(98,98,98)">=</span> <span class="ansi-yellow-bg">spectral_layout</span><span class="ansi-yellow-bg">(</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1079</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">data</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1080</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">graph</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1081</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">n_components</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1082</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">random_state</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1083</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">metric</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg">metric</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1084</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">metric_kwds</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg">metric_kwds</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1085</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg">)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1086</span>     expansion <span style="color: rgb(98,98,98)">=</span> <span style="color: rgb(98,98,98)">10.0</span> <span style="color: rgb(98,98,98)">/</span> np<span style="color: rgb(98,98,98)">.</span>abs(initialisation)<span style="color: rgb(98,98,98)">.</span>max()
 | |
| 
 | |
| File <span class="ansi-green-fg">/opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages/umap/spectral.py:332</span>, in <span class="ansi-cyan-fg">spectral_layout</span><span class="ansi-blue-fg">(data, graph, dim, random_state, metric, metric_kwds)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    331</span> <span class="ansi-bold" style="color: rgb(0,135,0)">if</span> L<span style="color: rgb(98,98,98)">.</span>shape[<span style="color: rgb(98,98,98)">0</span>] <span style="color: rgb(98,98,98)"><</span> <span style="color: rgb(98,98,98)">2000000</span>:
 | |
| <span class="ansi-green-fg">--> 332</span>     eigenvalues, eigenvectors <span style="color: rgb(98,98,98)">=</span> <span class="ansi-yellow-bg">scipy</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">sparse</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">linalg</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">eigsh</span><span class="ansi-yellow-bg">(</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    333</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">L</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    334</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">k</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    335</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">which</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">"</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">SM</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">"</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    336</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">ncv</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg">num_lanczos_vectors</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    337</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">tol</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">1e-4</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    338</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">v0</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg">np</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">ones</span><span class="ansi-yellow-bg">(</span><span class="ansi-yellow-bg">L</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">shape</span><span class="ansi-yellow-bg">[</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">0</span><span class="ansi-yellow-bg">]</span><span class="ansi-yellow-bg">)</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    339</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">maxiter</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg">graph</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">shape</span><span class="ansi-yellow-bg">[</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">0</span><span class="ansi-yellow-bg">]</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">*</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">5</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    340</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg">)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    341</span> <span class="ansi-bold" style="color: rgb(0,135,0)">else</span>:
 | |
| 
 | |
| File <span class="ansi-green-fg">/opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages/scipy/sparse/linalg/_eigen/arpack/arpack.py:1605</span>, in <span class="ansi-cyan-fg">eigsh</span><span class="ansi-blue-fg">(A, k, M, sigma, which, v0, ncv, maxiter, tol, return_eigenvectors, Minv, OPinv, mode)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1604</span> <span class="ansi-bold" style="color: rgb(0,135,0)">if</span> issparse(A):
 | |
| <span class="ansi-green-fg">-> 1605</span>     <span class="ansi-bold" style="color: rgb(0,135,0)">raise</span> <span class="ansi-bold" style="color: rgb(215,95,95)">TypeError</span>(<span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)">Cannot use scipy.linalg.eigh for sparse A with </span><span style="color: rgb(175,0,0)">"</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1606</span>                     <span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)">k >= N. Use scipy.linalg.eigh(A.toarray()) or</span><span style="color: rgb(175,0,0)">"</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1607</span>                     <span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)"> reduce k.</span><span style="color: rgb(175,0,0)">"</span>)
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1608</span> <span class="ansi-bold" style="color: rgb(0,135,0)">if</span> <span style="color: rgb(0,135,0)">isinstance</span>(A, LinearOperator):
 | |
| 
 | |
| <span class="ansi-red-fg">TypeError</span>: Cannot use scipy.linalg.eigh for sparse A with k >= N. Use scipy.linalg.eigh(A.toarray()) or reduce k.
 | |
| 
 | |
| During handling of the above exception, another exception occurred:
 | |
| 
 | |
| <span class="ansi-red-fg">TypeError</span>                                 Traceback (most recent call last)
 | |
| Cell <span class="ansi-green-fg">In[11], line 2</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">      1</span> input_file_path <span style="color: rgb(98,98,98)">=</span> <span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)">data_out.csv</span><span style="color: rgb(175,0,0)">"</span>
 | |
| <span class="ansi-green-fg">----> 2</span> topic_model, topic_df, most_frequent_topics <span style="color: rgb(98,98,98)">=</span> <span class="ansi-yellow-bg">ammico</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">text</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">PostprocessText</span><span class="ansi-yellow-bg">(</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">      3</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg">use_csv</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg ansi-bold" style="color: rgb(0,135,0)">True</span><span class="ansi-yellow-bg">,</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg">csv_path</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg">input_file_path</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">      4</span> <span class="ansi-yellow-bg">)</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">analyse_topic</span><span class="ansi-yellow-bg">(</span><span class="ansi-yellow-bg">return_topics</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">10</span><span class="ansi-yellow-bg">)</span>
 | |
| 
 | |
| File <span class="ansi-green-fg">~/work/AMMICO/AMMICO/ammico/text.py:221</span>, in <span class="ansi-cyan-fg">PostprocessText.analyse_topic</span><span class="ansi-blue-fg">(self, return_topics)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    219</span> <span class="ansi-bold" style="color: rgb(0,135,0)">except</span> <span class="ansi-bold" style="color: rgb(215,95,95)">TypeError</span>:
 | |
| <span class="ansi-green-intense-fg ansi-bold">    220</span>     <span style="color: rgb(0,135,0)">print</span>(<span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)">BERTopic excited with an error - maybe your dataset is too small?</span><span style="color: rgb(175,0,0)">"</span>)
 | |
| <span class="ansi-green-fg">--> 221</span> <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>topics, <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>probs <span style="color: rgb(98,98,98)">=</span> <span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">topic_model</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">fit_transform</span><span class="ansi-yellow-bg">(</span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">list_text_english</span><span class="ansi-yellow-bg">)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    222</span> <span style="color: rgb(95,135,135)"># return the topic list</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    223</span> topic_df <span style="color: rgb(98,98,98)">=</span> <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>topic_model<span style="color: rgb(98,98,98)">.</span>get_topic_info()
 | |
| 
 | |
| File <span class="ansi-green-fg">/opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages/bertopic/_bertopic.py:356</span>, in <span class="ansi-cyan-fg">BERTopic.fit_transform</span><span class="ansi-blue-fg">(self, documents, embeddings, y)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    354</span> <span class="ansi-bold" style="color: rgb(0,135,0)">if</span> <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>seed_topic_list <span class="ansi-bold" style="color: rgb(175,0,255)">is</span> <span class="ansi-bold" style="color: rgb(175,0,255)">not</span> <span class="ansi-bold" style="color: rgb(0,135,0)">None</span> <span class="ansi-bold" style="color: rgb(175,0,255)">and</span> <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>embedding_model <span class="ansi-bold" style="color: rgb(175,0,255)">is</span> <span class="ansi-bold" style="color: rgb(175,0,255)">not</span> <span class="ansi-bold" style="color: rgb(0,135,0)">None</span>:
 | |
| <span class="ansi-green-intense-fg ansi-bold">    355</span>     y, embeddings <span style="color: rgb(98,98,98)">=</span> <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>_guided_topic_modeling(embeddings)
 | |
| <span class="ansi-green-fg">--> 356</span> umap_embeddings <span style="color: rgb(98,98,98)">=</span> <span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_reduce_dimensionality</span><span class="ansi-yellow-bg">(</span><span class="ansi-yellow-bg">embeddings</span><span class="ansi-yellow-bg">,</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg">y</span><span class="ansi-yellow-bg">)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    358</span> <span style="color: rgb(95,135,135)"># Cluster reduced embeddings</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    359</span> documents, probabilities <span style="color: rgb(98,98,98)">=</span> <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>_cluster_embeddings(umap_embeddings, documents, y<span style="color: rgb(98,98,98)">=</span>y)
 | |
| 
 | |
| File <span class="ansi-green-fg">/opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages/bertopic/_bertopic.py:2872</span>, in <span class="ansi-cyan-fg">BERTopic._reduce_dimensionality</span><span class="ansi-blue-fg">(self, embeddings, y, partial_fit)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2869</span>     <span class="ansi-bold" style="color: rgb(0,135,0)">except</span> <span class="ansi-bold" style="color: rgb(215,95,95)">TypeError</span>:
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2870</span>         logger<span style="color: rgb(98,98,98)">.</span>info(<span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)">The dimensionality reduction algorithm did not contain the `y` parameter and</span><span style="color: rgb(175,0,0)">"</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2871</span>                     <span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)"> therefore the `y` parameter was not used</span><span style="color: rgb(175,0,0)">"</span>)
 | |
| <span class="ansi-green-fg">-> 2872</span>         <span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">umap_model</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">fit</span><span class="ansi-yellow-bg">(</span><span class="ansi-yellow-bg">embeddings</span><span class="ansi-yellow-bg">)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2874</span> umap_embeddings <span style="color: rgb(98,98,98)">=</span> <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>umap_model<span style="color: rgb(98,98,98)">.</span>transform(embeddings)
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2875</span> logger<span style="color: rgb(98,98,98)">.</span>info(<span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)">Reduced dimensionality</span><span style="color: rgb(175,0,0)">"</span>)
 | |
| 
 | |
| File <span class="ansi-green-fg">/opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages/umap/umap_.py:2684</span>, in <span class="ansi-cyan-fg">UMAP.fit</span><span class="ansi-blue-fg">(self, X, y)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2681</span>     <span style="color: rgb(0,135,0)">print</span>(ts(), <span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)">Construct embedding</span><span style="color: rgb(175,0,0)">"</span>)
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2683</span> <span class="ansi-bold" style="color: rgb(0,135,0)">if</span> <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>transform_mode <span style="color: rgb(98,98,98)">==</span> <span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)">embedding</span><span style="color: rgb(175,0,0)">"</span>:
 | |
| <span class="ansi-green-fg">-> 2684</span>     <span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>embedding_, aux_data <span style="color: rgb(98,98,98)">=</span> <span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_fit_embed_data</span><span class="ansi-yellow-bg">(</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2685</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_raw_data</span><span class="ansi-yellow-bg">[</span><span class="ansi-yellow-bg">index</span><span class="ansi-yellow-bg">]</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2686</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">n_epochs</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2687</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">init</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2688</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">random_state</span><span class="ansi-yellow-bg">,</span><span class="ansi-yellow-bg">  </span><span class="ansi-yellow-bg" style="color: rgb(95,135,135)"># JH why raw data?</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2689</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg">)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2690</span>     <span style="color: rgb(95,135,135)"># Assign any points that are fully disconnected from our manifold(s) to have embedding</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2691</span>     <span style="color: rgb(95,135,135)"># coordinates of np.nan.  These will be filtered by our plotting functions automatically.</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2692</span>     <span style="color: rgb(95,135,135)"># They also prevent users from being deceived a distance query to one of these points.</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2693</span>     <span style="color: rgb(95,135,135)"># Might be worth moving this into simplicial_set_embedding or _fit_embed_data</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2694</span>     disconnected_vertices <span style="color: rgb(98,98,98)">=</span> np<span style="color: rgb(98,98,98)">.</span>array(<span style="color: rgb(0,135,0)">self</span><span style="color: rgb(98,98,98)">.</span>graph_<span style="color: rgb(98,98,98)">.</span>sum(axis<span style="color: rgb(98,98,98)">=</span><span style="color: rgb(98,98,98)">1</span>))<span style="color: rgb(98,98,98)">.</span>flatten() <span style="color: rgb(98,98,98)">==</span> <span style="color: rgb(98,98,98)">0</span>
 | |
| 
 | |
| File <span class="ansi-green-fg">/opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages/umap/umap_.py:2717</span>, in <span class="ansi-cyan-fg">UMAP._fit_embed_data</span><span class="ansi-blue-fg">(self, X, n_epochs, init, random_state)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2713</span> <span class="ansi-bold" style="color: rgb(0,135,0)">def</span> <span style="color: rgb(0,0,255)">_fit_embed_data</span>(<span style="color: rgb(0,135,0)">self</span>, X, n_epochs, init, random_state):
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2714</span> <span style="color: rgb(188,188,188)">    </span><span style="color: rgb(175,0,0)">"""A method wrapper for simplicial_set_embedding that can be</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2715</span> <span style="color: rgb(175,0,0)">    replaced by subclasses.</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2716</span> <span style="color: rgb(175,0,0)">    """</span>
 | |
| <span class="ansi-green-fg">-> 2717</span>     <span class="ansi-bold" style="color: rgb(0,135,0)">return</span> <span class="ansi-yellow-bg">simplicial_set_embedding</span><span class="ansi-yellow-bg">(</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2718</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">X</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2719</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">graph_</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2720</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">n_components</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2721</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_initial_alpha</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2722</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_a</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2723</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_b</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2724</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">repulsion_strength</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2725</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">negative_sample_rate</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2726</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">n_epochs</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2727</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">init</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2728</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">random_state</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2729</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_input_distance_func</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2730</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_metric_kwds</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2731</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">densmap</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2732</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_densmap_kwds</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2733</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">output_dens</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2734</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_output_distance_func</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2735</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">_output_metric_kwds</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2736</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">output_metric</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg ansi-bold" style="color: rgb(175,0,255)">in</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg">(</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">"</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">euclidean</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">"</span><span class="ansi-yellow-bg">,</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">"</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">l2</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">"</span><span class="ansi-yellow-bg">)</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2737</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">random_state</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg ansi-bold" style="color: rgb(175,0,255)">is</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg ansi-bold" style="color: rgb(0,135,0)">None</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2738</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">verbose</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2739</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">tqdm_kwds</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg" style="color: rgb(0,135,0)">self</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">tqdm_kwds</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   2740</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg">)</span>
 | |
| 
 | |
| File <span class="ansi-green-fg">/opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages/umap/umap_.py:1078</span>, in <span class="ansi-cyan-fg">simplicial_set_embedding</span><span class="ansi-blue-fg">(data, graph, n_components, initial_alpha, a, b, gamma, negative_sample_rate, n_epochs, init, random_state, metric, metric_kwds, densmap, densmap_kwds, output_dens, output_metric, output_metric_kwds, euclidean_output, parallel, verbose, tqdm_kwds)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1073</span>     embedding <span style="color: rgb(98,98,98)">=</span> random_state<span style="color: rgb(98,98,98)">.</span>uniform(
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1074</span>         low<span style="color: rgb(98,98,98)">=</span><span style="color: rgb(98,98,98)">-</span><span style="color: rgb(98,98,98)">10.0</span>, high<span style="color: rgb(98,98,98)">=</span><span style="color: rgb(98,98,98)">10.0</span>, size<span style="color: rgb(98,98,98)">=</span>(graph<span style="color: rgb(98,98,98)">.</span>shape[<span style="color: rgb(98,98,98)">0</span>], n_components)
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1075</span>     )<span style="color: rgb(98,98,98)">.</span>astype(np<span style="color: rgb(98,98,98)">.</span>float32)
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1076</span> <span class="ansi-bold" style="color: rgb(0,135,0)">elif</span> <span style="color: rgb(0,135,0)">isinstance</span>(init, <span style="color: rgb(0,135,0)">str</span>) <span class="ansi-bold" style="color: rgb(175,0,255)">and</span> init <span style="color: rgb(98,98,98)">==</span> <span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)">spectral</span><span style="color: rgb(175,0,0)">"</span>:
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1077</span>     <span style="color: rgb(95,135,135)"># We add a little noise to avoid local minima for optimization to come</span>
 | |
| <span class="ansi-green-fg">-> 1078</span>     initialisation <span style="color: rgb(98,98,98)">=</span> <span class="ansi-yellow-bg">spectral_layout</span><span class="ansi-yellow-bg">(</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1079</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">data</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1080</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">graph</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1081</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">n_components</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1082</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">random_state</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1083</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">metric</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg">metric</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1084</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">metric_kwds</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg">metric_kwds</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1085</span> <span class="ansi-yellow-bg">    </span><span class="ansi-yellow-bg">)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1086</span>     expansion <span style="color: rgb(98,98,98)">=</span> <span style="color: rgb(98,98,98)">10.0</span> <span style="color: rgb(98,98,98)">/</span> np<span style="color: rgb(98,98,98)">.</span>abs(initialisation)<span style="color: rgb(98,98,98)">.</span>max()
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1087</span>     embedding <span style="color: rgb(98,98,98)">=</span> (initialisation <span style="color: rgb(98,98,98)">*</span> expansion)<span style="color: rgb(98,98,98)">.</span>astype(
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1088</span>         np<span style="color: rgb(98,98,98)">.</span>float32
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1089</span>     ) <span style="color: rgb(98,98,98)">+</span> random_state<span style="color: rgb(98,98,98)">.</span>normal(
 | |
| <span class="ansi-green-fg">   (...)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1092</span>         np<span style="color: rgb(98,98,98)">.</span>float32
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1093</span>     )
 | |
| 
 | |
| File <span class="ansi-green-fg">/opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages/umap/spectral.py:332</span>, in <span class="ansi-cyan-fg">spectral_layout</span><span class="ansi-blue-fg">(data, graph, dim, random_state, metric, metric_kwds)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    330</span> <span class="ansi-bold" style="color: rgb(0,135,0)">try</span>:
 | |
| <span class="ansi-green-intense-fg ansi-bold">    331</span>     <span class="ansi-bold" style="color: rgb(0,135,0)">if</span> L<span style="color: rgb(98,98,98)">.</span>shape[<span style="color: rgb(98,98,98)">0</span>] <span style="color: rgb(98,98,98)"><</span> <span style="color: rgb(98,98,98)">2000000</span>:
 | |
| <span class="ansi-green-fg">--> 332</span>         eigenvalues, eigenvectors <span style="color: rgb(98,98,98)">=</span> <span class="ansi-yellow-bg">scipy</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">sparse</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">linalg</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">eigsh</span><span class="ansi-yellow-bg">(</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    333</span> <span class="ansi-yellow-bg">            </span><span class="ansi-yellow-bg">L</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    334</span> <span class="ansi-yellow-bg">            </span><span class="ansi-yellow-bg">k</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    335</span> <span class="ansi-yellow-bg">            </span><span class="ansi-yellow-bg">which</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">"</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">SM</span><span class="ansi-yellow-bg" style="color: rgb(175,0,0)">"</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    336</span> <span class="ansi-yellow-bg">            </span><span class="ansi-yellow-bg">ncv</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg">num_lanczos_vectors</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    337</span> <span class="ansi-yellow-bg">            </span><span class="ansi-yellow-bg">tol</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">1e-4</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    338</span> <span class="ansi-yellow-bg">            </span><span class="ansi-yellow-bg">v0</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg">np</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">ones</span><span class="ansi-yellow-bg">(</span><span class="ansi-yellow-bg">L</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">shape</span><span class="ansi-yellow-bg">[</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">0</span><span class="ansi-yellow-bg">]</span><span class="ansi-yellow-bg">)</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    339</span> <span class="ansi-yellow-bg">            </span><span class="ansi-yellow-bg">maxiter</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">=</span><span class="ansi-yellow-bg">graph</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">.</span><span class="ansi-yellow-bg">shape</span><span class="ansi-yellow-bg">[</span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">0</span><span class="ansi-yellow-bg">]</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">*</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg" style="color: rgb(98,98,98)">5</span><span class="ansi-yellow-bg">,</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    340</span> <span class="ansi-yellow-bg">        </span><span class="ansi-yellow-bg">)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    341</span>     <span class="ansi-bold" style="color: rgb(0,135,0)">else</span>:
 | |
| <span class="ansi-green-intense-fg ansi-bold">    342</span>         eigenvalues, eigenvectors <span style="color: rgb(98,98,98)">=</span> scipy<span style="color: rgb(98,98,98)">.</span>sparse<span style="color: rgb(98,98,98)">.</span>linalg<span style="color: rgb(98,98,98)">.</span>lobpcg(
 | |
| <span class="ansi-green-intense-fg ansi-bold">    343</span>             L, random_state<span style="color: rgb(98,98,98)">.</span>normal(size<span style="color: rgb(98,98,98)">=</span>(L<span style="color: rgb(98,98,98)">.</span>shape[<span style="color: rgb(98,98,98)">0</span>], k)), largest<span style="color: rgb(98,98,98)">=</span><span class="ansi-bold" style="color: rgb(0,135,0)">False</span>, tol<span style="color: rgb(98,98,98)">=</span><span style="color: rgb(98,98,98)">1e-8</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">    344</span>         )
 | |
| 
 | |
| File <span class="ansi-green-fg">/opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages/scipy/sparse/linalg/_eigen/arpack/arpack.py:1605</span>, in <span class="ansi-cyan-fg">eigsh</span><span class="ansi-blue-fg">(A, k, M, sigma, which, v0, ncv, maxiter, tol, return_eigenvectors, Minv, OPinv, mode)</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1600</span> warnings<span style="color: rgb(98,98,98)">.</span>warn(<span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)">k >= N for N * N square matrix. </span><span style="color: rgb(175,0,0)">"</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1601</span>               <span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)">Attempting to use scipy.linalg.eigh instead.</span><span style="color: rgb(175,0,0)">"</span>,
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1602</span>               <span class="ansi-bold" style="color: rgb(215,95,95)">RuntimeWarning</span>)
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1604</span> <span class="ansi-bold" style="color: rgb(0,135,0)">if</span> issparse(A):
 | |
| <span class="ansi-green-fg">-> 1605</span>     <span class="ansi-bold" style="color: rgb(0,135,0)">raise</span> <span class="ansi-bold" style="color: rgb(215,95,95)">TypeError</span>(<span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)">Cannot use scipy.linalg.eigh for sparse A with </span><span style="color: rgb(175,0,0)">"</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1606</span>                     <span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)">k >= N. Use scipy.linalg.eigh(A.toarray()) or</span><span style="color: rgb(175,0,0)">"</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1607</span>                     <span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)"> reduce k.</span><span style="color: rgb(175,0,0)">"</span>)
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1608</span> <span class="ansi-bold" style="color: rgb(0,135,0)">if</span> <span style="color: rgb(0,135,0)">isinstance</span>(A, LinearOperator):
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1609</span>     <span class="ansi-bold" style="color: rgb(0,135,0)">raise</span> <span class="ansi-bold" style="color: rgb(215,95,95)">TypeError</span>(<span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)">Cannot use scipy.linalg.eigh for LinearOperator </span><span style="color: rgb(175,0,0)">"</span>
 | |
| <span class="ansi-green-intense-fg ansi-bold">   1610</span>                     <span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)">A with k >= N.</span><span style="color: rgb(175,0,0)">"</span>)
 | |
| 
 | |
| <span class="ansi-red-fg">TypeError</span>: Cannot use scipy.linalg.eigh for sparse A with k >= N. Use scipy.linalg.eigh(A.toarray()) or reduce k.
 | |
| </pre></div></div>
 | |
| </div>
 | |
| </section>
 | |
| <section id="Access-frequent-topics">
 | |
| <h3>Access frequent topics<a class="headerlink" href="#Access-frequent-topics" title="Permalink to this heading"></a></h3>
 | |
| <p>A topic of <code class="docutils literal notranslate"><span class="pre">-1</span></code> stands for an outlier and should be ignored. Topic count is the number of occurrences of that topic. The output is structured from most frequent to least frequent topic.</p>
 | |
| <div class="nbinput docutils container">
 | |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[12]:
 | |
| </pre></div>
 | |
| </div>
 | |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="nb">print</span><span class="p">(</span><span class="n">topic_df</span><span class="p">)</span>
 | |
| </pre></div>
 | |
| </div>
 | |
| </div>
 | |
| <div class="nboutput nblast docutils container">
 | |
| <div class="prompt empty docutils container">
 | |
| </div>
 | |
| <div class="output_area docutils container">
 | |
| <div class="highlight"><pre>
 | |
| <span class="ansi-red-fg">---------------------------------------------------------------------------</span>
 | |
| <span class="ansi-red-fg">NameError</span>                                 Traceback (most recent call last)
 | |
| Cell <span class="ansi-green-fg">In[12], line 1</span>
 | |
| <span class="ansi-green-fg">----> 1</span> <span style="color: rgb(0,135,0)">print</span>(<span class="ansi-yellow-bg">topic_df</span>)
 | |
| 
 | |
| <span class="ansi-red-fg">NameError</span>: name 'topic_df' is not defined
 | |
| </pre></div></div>
 | |
| </div>
 | |
| </section>
 | |
| <section id="Get-information-for-specific-topic">
 | |
| <h3>Get information for specific topic<a class="headerlink" href="#Get-information-for-specific-topic" title="Permalink to this heading"></a></h3>
 | |
| <p>The most frequent topics can be accessed through <code class="docutils literal notranslate"><span class="pre">most_frequent_topics</span></code>, with the most frequently occurring topics first in the list.</p>
 | |
| <div class="nbinput docutils container">
 | |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[13]:
 | |
| </pre></div>
 | |
| </div>
 | |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="k">for</span> <span class="n">topic</span> <span class="ow">in</span> <span class="n">most_frequent_topics</span><span class="p">:</span>
 | |
|     <span class="nb">print</span><span class="p">(</span><span class="s2">"Topic:"</span><span class="p">,</span> <span class="n">topic</span><span class="p">)</span>
 | |
| </pre></div>
 | |
| </div>
 | |
| </div>
 | |
| <div class="nboutput nblast docutils container">
 | |
| <div class="prompt empty docutils container">
 | |
| </div>
 | |
| <div class="output_area docutils container">
 | |
| <div class="highlight"><pre>
 | |
| <span class="ansi-red-fg">---------------------------------------------------------------------------</span>
 | |
| <span class="ansi-red-fg">NameError</span>                                 Traceback (most recent call last)
 | |
| Cell <span class="ansi-green-fg">In[13], line 1</span>
 | |
| <span class="ansi-green-fg">----> 1</span> <span class="ansi-bold" style="color: rgb(0,135,0)">for</span> topic <span class="ansi-bold" style="color: rgb(175,0,255)">in</span> <span class="ansi-yellow-bg">most_frequent_topics</span>:
 | |
| <span class="ansi-green-intense-fg ansi-bold">      2</span>     <span style="color: rgb(0,135,0)">print</span>(<span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)">Topic:</span><span style="color: rgb(175,0,0)">"</span>, topic)
 | |
| 
 | |
| <span class="ansi-red-fg">NameError</span>: name 'most_frequent_topics' is not defined
 | |
| </pre></div></div>
 | |
| </div>
 | |
| </section>
 | |
| <section id="Topic-visualization">
 | |
| <h3>Topic visualization<a class="headerlink" href="#Topic-visualization" title="Permalink to this heading"></a></h3>
 | |
| <p>The topics can also be visualized. Careful: This only works if there is sufficient data (quantity and quality).</p>
 | |
| <div class="nbinput docutils container">
 | |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[14]:
 | |
| </pre></div>
 | |
| </div>
 | |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">topic_model</span><span class="o">.</span><span class="n">visualize_topics</span><span class="p">()</span>
 | |
| </pre></div>
 | |
| </div>
 | |
| </div>
 | |
| <div class="nboutput nblast docutils container">
 | |
| <div class="prompt empty docutils container">
 | |
| </div>
 | |
| <div class="output_area docutils container">
 | |
| <div class="highlight"><pre>
 | |
| <span class="ansi-red-fg">---------------------------------------------------------------------------</span>
 | |
| <span class="ansi-red-fg">NameError</span>                                 Traceback (most recent call last)
 | |
| Cell <span class="ansi-green-fg">In[14], line 1</span>
 | |
| <span class="ansi-green-fg">----> 1</span> <span class="ansi-yellow-bg">topic_model</span><span style="color: rgb(98,98,98)">.</span>visualize_topics()
 | |
| 
 | |
| <span class="ansi-red-fg">NameError</span>: name 'topic_model' is not defined
 | |
| </pre></div></div>
 | |
| </div>
 | |
| </section>
 | |
| <section id="Save-the-model"> <!-- Section "Save the model": one prose sentence, input cell [15], its output, and an empty trailing cell -->
 | |
| <h3>Save the model<a class="headerlink" href="#Save-the-model" title="Permalink to this heading"></a></h3>
 | |
| <p>The model can be saved for future use.</p>
 | |
| <div class="nbinput docutils container"> <!-- Input cell [15]: calls topic_model.save("misinfo_posts") -->
 | |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[15]:
 | |
| </pre></div>
 | |
| </div>
 | |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">topic_model</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="s2">"misinfo_posts"</span><span class="p">)</span>
 | |
| </pre></div>
 | |
| </div>
 | |
| </div>
 | |
| <div class="nboutput nblast docutils container"> <!-- Output of cell [15]: NameError traceback ('topic_model' is not defined). NOTE(review): same failure as cell [14]; presumably nothing was saved during this run (confirm against the notebook source). -->
 | |
| <div class="prompt empty docutils container">
 | |
| </div>
 | |
| <div class="output_area docutils container">
 | |
| <div class="highlight"><pre>
 | |
| <span class="ansi-red-fg">---------------------------------------------------------------------------</span>
 | |
| <span class="ansi-red-fg">NameError</span>                                 Traceback (most recent call last)
 | |
| Cell <span class="ansi-green-fg">In[15], line 1</span>
 | |
| <span class="ansi-green-fg">----> 1</span> <span class="ansi-yellow-bg">topic_model</span><span style="color: rgb(98,98,98)">.</span>save(<span style="color: rgb(175,0,0)">"</span><span style="color: rgb(175,0,0)">misinfo_posts</span><span style="color: rgb(175,0,0)">"</span>)
 | |
| 
 | |
| <span class="ansi-red-fg">NameError</span>: name 'topic_model' is not defined
 | |
| </pre></div></div>
 | |
| </div>
 | |
| <div class="nbinput nblast docutils container"> <!-- Trailing cell with an empty "[ ]" prompt and no source. NOTE(review): looks like a leftover blank notebook cell; consider removing it from the notebook. -->
 | |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[ ]:
 | |
| </pre></div>
 | |
| </div>
 | |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span>
 | |
| </pre></div>
 | |
| </div>
 | |
| </div>
 | |
| </section>
 | |
| </section>
 | |
| </section>
 | |
| 
 | |
| 
 | |
|            </div>
 | |
|           </div>
 | |
|           <footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer"> <!-- Page footer: previous/next page links (rel="prev" / rel="next", access keys p / n), then copyright and build credits -->
 | |
|         <a href="Example%20faces.html" class="btn btn-neutral float-left" title="Facial Expression recognition with DeepFace" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
 | |
|         <a href="Example%20summary.html" class="btn btn-neutral float-right" title="Image summary and visual question answering" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
 | |
|     </div>
 | |
| 
 | |
|   <hr/>
 | |
| 
 | |
|   <div role="contentinfo"> <!-- Copyright notice -->
 | |
|     <p>© Copyright 2022, Scientific Software Center, Heidelberg University.</p>
 | |
|   </div>
 | |
| 
 | |
|   Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
 | |
|     <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
 | |
|     provided by <a href="https://readthedocs.org">Read the Docs</a>.
 | |
|    
 | |
| 
 | |
| </footer>
 | |
|         </div>
 | |
|       </div>
 | |
|     </section>
 | |
|   </div>
 | |
|   <script>
 | |
|       jQuery(function () { // a function passed to jQuery() is run once the DOM is ready
 | |
|           SphinxRtdTheme.Navigation.enable(true); // presumably initialises the theme's sidebar navigation; the meaning of `true` is not visible here (TODO confirm against sphinx_rtd_theme)
 | |
|       });
 | |
|   </script> 
 | |
| 
 | |
| </body>
 | |
| </html> | 
