# Notebook for extracting text from images
Inga Ulusoy, SSC, July 2022

In [None]:
# if running on google colab
# flake8-noqa-cell
import os

if "google.colab" in str(get_ipython()):
 # update python version
 # install setuptools
 !pip install setuptools==61 -qqq
 # install misinformation
 !pip install git+https://github.com/ssciwr/misinformation.git -qqq
 # mount google drive for data and API key
 from google.colab import drive

 drive.mount("/content/drive")

In [None]:
import os
import misinformation
from misinformation import utils as mutils
from misinformation import display as mdisplay

In [None]:
# Folder that holds the images: point this at your Google Drive folder
# or at a local folder.
image_dir = "/content/drive/MyDrive/misinformation-data/"
# NOTE(review): limit=10 presumably caps how many files are returned -
# confirm against mutils.find_files
images = mutils.find_files(path=image_dir, limit=10)

In [None]:
# Build the dictionary used by the rest of the notebook from `images`;
# later cells iterate over its keys and overwrite its values with the
# analysis results.
mydict = mutils.initialize_dict(images)

# Google Cloud Vision API
The first 1000 images per month are free.

In [None]:
# Point the GOOGLE_APPLICATION_CREDENTIALS environment variable at the JSON
# key file stored on the mounted drive; adjust the path to your own key file.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = (
    "/content/drive/MyDrive/misinformation-data/misinformation-campaign-981aa55a3b13.json"
)

## Inspect the elements per image

In [None]:
# Inspect the entries of mydict with the "text-on-image" identifier selected.
# NOTE(review): presumably opens an interactive viewer - confirm in
# misinformation.display
mdisplay.explore_analysis(mydict, identify="text-on-image")

## Or directly analyze for further processing

In [None]:
# Run the text detector on every entry and store the result back under the
# same key, so that mydict ends up holding the analysed data.
# NOTE(review): analyse_text=True presumably also analyses the extracted
# text - confirm against TextDetector
for key in mydict:
    print(key)  # show which entry is being processed
    detector = misinformation.text.TextDetector(mydict[key], analyse_text=True)
    mydict[key] = detector.analyse_image()

## Convert to dataframe and write csv

In [None]:
# Convert the results held in mydict into a dataframe in two steps.
# NOTE(review): outdict is presumably a column-oriented dict and df a pandas
# DataFrame (later cells call df.head and df.to_csv) - confirm in
# misinformation.utils
outdict = mutils.append_data_to_dict(mydict)
df = mutils.dump_df(outdict)

In [None]:
# check the dataframe: df.head(10) is the last expression of the cell, so
# the notebook displays its result (the first 10 rows, assuming a pandas
# DataFrame)
df.head(10)

In [None]:
# Write the dataframe to data_out.csv in the current working directory;
# "Option 2" of the topic analysis reads a file of that name back in.
df.to_csv("./data_out.csv")

# Topic analysis
The topic analysis is carried out with [BERTopic](https://maartengr.github.io/BERTopic/index.html), using an embedded model through a [spaCy](https://spacy.io/) pipeline.

BERTopic takes a list of strings as input. The more items in the list, the better for the topic modeling. If the cell below raises an error for `analyse_topic()`, the reason may be that your dataset is too small.
### Option 1: Use the dictionary as obtained from the above analysis.

In [None]:
# Option 1: run the topic analysis on the in-memory results. According to
# the original author's note, the text_english entries of the analysed
# images are collected from mydict into a list.
postprocessor = misinformation.text.PostprocessText(mydict=mydict)
topic_model, topic_df, most_frequent_topics = postprocessor.analyse_topic()

### Option 2: Read in a csv
To avoid analysing too many images with Google Cloud Vision, use the CSV output to obtain the text when rerunning already-analysed images.

In [None]:
# Option 2: run the topic analysis on the text stored in a csv file
# instead of on the in-memory dictionary.
input_file_path = "data_out.csv"
postprocessor = misinformation.text.PostprocessText(
    use_csv=True, csv_path=input_file_path
)
# NOTE(review): return_topics=10 presumably sets how many of the most
# frequent topics are returned - confirm against analyse_topic
topic_model, topic_df, most_frequent_topics = postprocessor.analyse_topic(
    return_topics=10
)

### Access frequent topics
A topic of `-1` stands for an outlier and should be ignored. Topic count is the number of occurrences of that topic. The output is ordered from the most frequent to the least frequent topic.

In [None]:
# Print the topic table that analyse_topic() returned as its second value.
print(topic_df)

### Get information for specific topic
The most frequent topics can be accessed through `most_frequent_topics`, with the most frequently occurring topics first in the list.

In [None]:
# Print every entry of most_frequent_topics on its own line, prefixed
# with "Topic:".
for topic in most_frequent_topics:
 print("Topic:", topic)

### Topic visualization
The topics can also be visualized. Careful: This only works if there is sufficient data (quantity and quality).

In [None]:
# Last expression of the cell, so the notebook displays whatever
# visualize_topics() returns.
topic_model.visualize_topics()

### Save the model
The model can be saved for future use.

In [None]:
# Save the topic model under the relative path "misinfo_posts"
# (resolved against the current working directory).
topic_model.save("misinfo_posts")