Mirror of
https://github.com/ssciwr/AMMICO.git
synced 2025-10-29 13:06:04 +02:00
Text cleanup and sentiment analysis (#49)
* update notebook
* comments
* add jupyterlab
* add text analysis capability
* add bool in tests
* add dependencies and spelling test
* add test sentiment
* update black pre-commit dependency for native nb support
* update black version, find better sentiment test
* test analyse_image
This commit is contained in:
parent b585097f19
commit 54728e02bb
1 .github/workflows/ci.yml (vendored)
@@ -28,6 +28,7 @@ jobs:
           python -m pip install --upgrade pip
           python -m pip install -e .
           python -m spacy download en_core_web_md
+          python -m textblob.download_corpora
       - name: Run pytest
         run: |
           cd misinformation
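
For context (not part of the diff): the added CI line fetches the NLTK corpora that TextBlob's spelling correction and sentiment lexicon rely on. A rough Python-side equivalent, assuming the downloader module bundled with textblob:

# Sketch: same effect as `python -m textblob.download_corpora`
from textblob import download_corpora

download_corpora.download_all()  # pulls the NLTK corpora TextBlob needs
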
.pre-commit-config.yaml
@@ -8,10 +8,10 @@ repos:
     rev: 22.12.0
     hooks:
       - id: black
-  - repo: https://github.com/dfm/black_nbconvert
-    rev: v0.4.0
+  - repo: https://github.com/psf/black
+    rev: 22.12.0
     hooks:
-      - id: black_nbconvert
+      - id: black-jupyter
   - repo: https://github.com/pycqa/flake8
     rev: 6.0.0
     hooks:
misinformation/test/test_text.py
@@ -29,6 +29,18 @@ def test_TextDetector():
         assert test_obj.subdict["text_language"] is None
         assert test_obj.subdict["text_english"] is None
         assert test_obj.subdict["text_cleaned"] is None
+        assert not test_obj.analyse_text
+        assert not test_obj.analyse_topic
+
+
+@pytest.mark.gcv
+def test_analyse_image():
+    for item in TESTDICT:
+        test_obj = tt.TextDetector(TESTDICT[item])
+        test_obj.analyse_image()
+        test_obj = tt.TextDetector(TESTDICT[item], analyse_text=True)
+        test_obj.analyse_image()
+        test_obj = tt.TextDetector(TESTDICT[item], analyse_topic=True)
 
 
 @pytest.mark.gcv
@@ -58,7 +70,7 @@ def test_translate_text():
 
 
 def test_init_spacy():
-    test_obj = tt.TextDetector(TESTDICT["IMG_3755"])
+    test_obj = tt.TextDetector(TESTDICT["IMG_3755"], analyse_text=True)
     ref_file = "./test/data/text_IMG_3755.txt"
     with open(ref_file, "r") as file:
         reference_text = file.read()
@@ -75,3 +87,23 @@ def test_clean_text():
     test_obj.clean_text()
     result = "I like cats and"
     assert test_obj.subdict["text_clean"] == result
+
+
+def test_correct_spelling():
+    mydict = {}
+    test_obj = tt.TextDetector(mydict, analyse_text=True)
+    test_obj.subdict["text_english"] = "I lik cats ad dogs."
+    test_obj.correct_spelling()
+    result = "I like cats ad dogs."
+    assert test_obj.subdict["text_english_correct"] == result
+
+
+def test_sentiment_analysis():
+    mydict = {}
+    test_obj = tt.TextDetector(mydict, analyse_text=True)
+    test_obj.subdict["text_english"] = "I love cats and dogs."
+    test_obj._init_spacy()
+    test_obj.correct_spelling()
+    test_obj.sentiment_analysis()
+    assert test_obj.subdict["polarity"] == 0.5
+    assert test_obj.subdict["subjectivity"] == 0.6
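
A quick way to see where the expected values in these tests come from (a sketch using plain TextBlob, assuming it scores the same as the spacytextblob pipeline component): `correct()` fixes "lik" to "like" but leaves "ad" alone, since "ad" is itself a dictionary word, and the sentiment scores come from the lexicon entry for "love".

from textblob import TextBlob

print(TextBlob("I lik cats ad dogs.").correct())  # I like cats ad dogs.
blob = TextBlob("I love cats and dogs.")
print(blob.sentiment.polarity)      # 0.5
print(blob.sentiment.subjectivity)  # 0.6
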
misinformation/text.py
@@ -1,17 +1,29 @@
 from google.cloud import vision
 from googletrans import Translator
 import spacy
+from spacytextblob.spacytextblob import SpacyTextBlob
+from textblob import TextBlob
 import io
 from misinformation import utils
 
+# make widgets work again
+# clean text has weird spaces and separation of "do n't"
+# increase coverage for text
+
 
 class TextDetector(utils.AnalysisMethod):
-    def __init__(self, subdict: dict) -> None:
+    def __init__(
+        self, subdict: dict, analyse_text: bool = False, analyse_topic: bool = False
+    ) -> None:
         super().__init__(subdict)
         self.subdict.update(self.set_keys())
         self.translator = Translator()
-        # spacy load should be separate method with error if model not found / dynamic download
-        self.nlp = spacy.load("en_core_web_md")
+        self.analyse_text = analyse_text
+        self.analyse_topic = analyse_topic
+        if self.analyse_text:
+            # spacy load should be separate method with error if model not found / dynamic download
+            self.nlp = spacy.load("en_core_web_md")
+            self.nlp.add_pipe("spacytextblob")
 
     def set_keys(self) -> dict:
         params = {
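
The TODO comment kept in the new code asks for a separate loader with a download fallback. A minimal sketch of what that could look like (hypothetical helper, not part of the commit):

import spacy

def load_spacy_model(name: str = "en_core_web_md"):
    # Load the pipeline; fetch it on first use if it is not installed.
    try:
        return spacy.load(name)
    except OSError:
        from spacy.cli import download
        download(name)  # same as `python -m spacy download en_core_web_md`
        return spacy.load(name)
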
@@ -25,8 +37,13 @@ class TextDetector(utils.AnalysisMethod):
     def analyse_image(self):
         self.get_text_from_image()
         self.translate_text()
-        self._init_spacy()
-        self.clean_text()
+        if self.analyse_text:
+            self._init_spacy()
+            self.clean_text()
+            self.correct_spelling()
+            self.sentiment_analysis()
+        if self.analyse_topic:
+            self.analyse_topic()
         return self.subdict
 
     def get_text_from_image(self):
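
One caveat in the hunk above (an observation, not a change in the commit): `self.analyse_topic = analyse_topic` stores a bool under the same name as the `analyse_topic` method, and instance attributes shadow methods, so `self.analyse_topic()` raises once the flag is set. A minimal demonstration with a hypothetical class:

class Demo:
    def __init__(self, analyse_topic: bool = False) -> None:
        self.analyse_topic = analyse_topic  # shadows the method below

    def analyse_topic(self):
        print("topic analysis")

d = Demo(analyse_topic=True)
try:
    d.analyse_topic()
except TypeError as err:
    print(err)  # 'bool' object is not callable
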
@@ -65,3 +82,18 @@ class TextDetector(utils.AnalysisMethod):
                 token.text
             ) if token.pos_ != "NUM" and token.has_vector else None
         self.subdict["text_clean"] = " ".join(templist).rstrip().lstrip()
+
+    def correct_spelling(self):
+        self.textblob = TextBlob(self.subdict["text_english"])
+        self.subdict["text_english_correct"] = str(self.textblob.correct())
+
+    def sentiment_analysis(self):
+        # self.subdict["sentiment"] = self.doc._.blob.sentiment_assessments.assessments
+        # polarity is between [-1.0, 1.0]
+        self.subdict["polarity"] = self.doc._.blob.polarity
+        # subjectivity is a float within the range [0.0, 1.0]
+        # where 0.0 is very objective and 1.0 is very subjective
+        self.subdict["subjectivity"] = self.doc._.blob.subjectivity
+
+    def analyse_topic(self):
+        pass
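
For reference, the `doc._.blob` attribute read in `sentiment_analysis` is provided by the spacytextblob pipeline component added in `__init__`. A self-contained sketch of the same flow (assumes en_core_web_md and spacytextblob are installed):

import spacy
from spacytextblob.spacytextblob import SpacyTextBlob  # noqa: F401, registers the factory

nlp = spacy.load("en_core_web_md")
nlp.add_pipe("spacytextblob")
doc = nlp("I love cats and dogs.")
print(doc._.blob.polarity)      # float in [-1.0, 1.0]
print(doc._.blob.subjectivity)  # float in [0.0, 1.0]
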
2 notebooks/facial_expressions.ipynb (generated)
@@ -201,7 +201,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.9.0"
   }
  },
  "nbformat": 4,
140 notebooks/get-text-from-image.ipynb (generated)
@@ -28,7 +28,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "images = misinformation.find_files(path=\"../data/images-little-text/\", limit=1000)"
+    "images = misinformation.find_files(path=\"../data/images-text/\", limit=1000)"
    ]
   },
   {
@@ -42,28 +42,6 @@
     " display(Image(filename=i))"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "366e2060",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# start with only English\n",
-    "mysubfiles = [i for i in images if \"eng\" in i]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b330b267",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "for i in mysubfiles[0:10]:\n",
-    "    display(Image(filename=i))"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -102,7 +80,7 @@
    "source": [
     "os.environ[\n",
     "    \"GOOGLE_APPLICATION_CREDENTIALS\"\n",
-    "] = \"../data/seismic-bonfire-329406-412821a70264.json\""
+    "] = \"../data/misinformation-campaign-981aa55a3b13.json\""
    ]
   },
   {
@@ -143,14 +121,6 @@
     " mydict[key] = misinformation.text.TextDetector(mydict[key]).analyse_image()"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c75a2fff-9d59-4634-8d28-e90a909caa23",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -158,120 +128,54 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "print(mydict[\"104157S_eng\"][\"text\"])"
+    "print(mydict[\"109237S_spa\"][\"text_clean\"])"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "3c063eda",
+   "metadata": {},
+   "source": [
+    "## Convert to dataframe and write csv"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "3724f8d8-be0f-44eb-8ff9-b425eee94efc",
+   "id": "5709c2cd",
    "metadata": {},
    "outputs": [],
    "source": [
-    "test = mydict[\"104157S_eng\"][\"text\"][0]\n",
-    "print(test)"
+    "outdict = misinformation.utils.append_data_to_dict(mydict)\n",
+    "df = misinformation.utils.dump_df(outdict)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "2fb865ab-6d16-4a4c-b004-9d39fcba6812",
+   "id": "c4f05637",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# flake8-noqa-cell\n",
-    "from googletrans import Translator"
+    "# check the dataframe\n",
+    "df.head(10)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "051ad473-8f75-40fc-ae90-7d8176cf816f",
+   "id": "bf6c9ddb",
    "metadata": {},
    "outputs": [],
    "source": [
-    "translator = Translator()"
+    "# Write the csv\n",
+    "df.to_csv(\"./data_out.csv\")"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b571d900-8829-4095-904f-dfee3ce46041",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "result = translator.translate(test)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e1ff0b21-d2cc-4a50-8e86-a45362f1a0a1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(result.text)\n",
-    "print(result.src)\n",
-    "print(result.origin)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e135f00e-cdd5-4931-8649-ba0b293e8bdd",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# now clean the gibberish\n",
-    "# do spell check\n",
-    "# flake8-noqa-cell\n",
-    "import contextualSpellCheck\n",
-    "import spacy\n",
-    "\n",
-    "nlp = spacy.load(\"en_core_web_md\")\n",
-    "contextualSpellCheck.add_to_pipe(nlp)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "01d17f6e-30ae-4cc1-ad03-11a3be6847c2",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "doc = nlp(result.text)\n",
-    "print(doc._.outcome_spellCheck)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0ef91dd2-70a8-4a04-b50f-5efba076bbb0",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "final_text = []\n",
-    "for token in doc:\n",
-    "    if token.pos_ != \"SPACE\":\n",
-    "        if token.pos_ != \"NUM\":\n",
-    "            if token.has_vector:\n",
-    "                final_text.append(token.text)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "4d0a3360-3fd9-4ab3-b4dc-122e90c16c7c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(\" \".join(final_text))"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "b1abe05e-0c2b-4769-8fa5-fc67297288ea",
+   "id": "568537df",
    "metadata": {},
    "outputs": [],
    "source": []
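
The new notebook cells rely on two project helpers, `append_data_to_dict` and `dump_df`, whose implementation is not shown in this diff. A rough sketch of the behaviour the cells imply (assumed, not taken from utils.py): flatten the per-image dict into columns, then build a pandas DataFrame for CSV export.

import pandas as pd

def append_data_to_dict_sketch(mydict: dict) -> dict:
    # {"img1": {"text": ...}, "img2": {...}} -> {"text": [...], ...}
    outdict = {}
    for subdict in mydict.values():
        for key, value in subdict.items():
            outdict.setdefault(key, []).append(value)
    return outdict

def dump_df_sketch(outdict: dict) -> pd.DataFrame:
    # One row per image, one column per extracted field.
    return pd.DataFrame(outdict)
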
@@ -293,7 +197,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.9.0"
   },
   "vscode": {
    "interpreter": {
57 notebooks/objects_expression.ipynb (generated)
@@ -38,7 +38,7 @@
    "outputs": [],
    "source": [
     "images = misinformation.find_files(\n",
-    "    path=\"/home/inga/projects/misinformation-project/misinformation/data/test_no_text/\",\n",
+    "    path=\"../data/images-little-text/\",\n",
     "    limit=1000,\n",
     ")"
    ]
@@ -52,6 +52,15 @@
    "mydict = misinformation.utils.initialize_dict(images)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mydict"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -136,6 +145,50 @@
    "misinformation.explore_analysis(mydict, identify=\"objects\")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def localize_objects(path):\n",
+    "    \"\"\"Localize objects in the local image.\n",
+    "\n",
+    "    Args:\n",
+    "    path: The path to the local file.\n",
+    "    \"\"\"\n",
+    "    from google.cloud import vision\n",
+    "\n",
+    "    client = vision.ImageAnnotatorClient()\n",
+    "\n",
+    "    with open(path, \"rb\") as image_file:\n",
+    "        content = image_file.read()\n",
+    "    image = vision.Image(content=content)\n",
+    "\n",
+    "    objects = client.object_localization(image=image).localized_object_annotations\n",
+    "\n",
+    "    print(\"Number of objects found: {}\".format(len(objects)))\n",
+    "    for object_ in objects:\n",
+    "        print(\"\\n{} (confidence: {})\".format(object_.name, object_.score))\n",
+    "        print(\"Normalized bounding polygon vertices: \")\n",
+    "        for vertex in object_.bounding_poly.normalized_vertices:\n",
+    "            print(\"    - ({}, {})\".format(vertex.x, vertex.y))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "os.environ[\n",
+    "    \"GOOGLE_APPLICATION_CREDENTIALS\"\n",
+    "] = \"../../misinformation-notes/seismic-bonfire-329406-412821a70264.json\"\n",
+    "localize_objects(\"/home/iulusoy/Desktop/102141_2_eng.png\")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -160,7 +213,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.5"
+   "version": "3.9.0"
   },
   "vscode": {
    "interpreter": {
pyproject.toml
@@ -42,6 +42,9 @@ dependencies = [
     "opencv-contrib-python",
     "googletrans==3.1.0a0",
     "spacy",
+    "jupyterlab",
+    "spacytextblob",
+    "textblob",
 ]
 
 [project.scripts]
requirements.txt
@@ -18,4 +18,7 @@ matplotlib
 opencv-contrib-python
 googletrans==3.1.0a0
 spacy
 https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.1/en_core_web_md-3.4.1.tar.gz
+jupyterlab
+spacytextblob
+textblob