299 строки
12 KiB
Plaintext
299 строки
12 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"5 - Email analysis:\n",
|
|
"Below are code examples that will help you parse the single email message file, or whole directory with malicous emails and exctracts various medatata for further use.\n",
|
|
"It can be combined with previous codes e.g to exctrat named entities, sentiment or even draw a graph."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Directory and exmaple file selection:"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#----\n",
|
|
"directory=\"./emails/\"\n",
|
|
"filename=directory+'1.eml'\n",
|
|
"#----"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Working example for mailparser library, that you can use for your for further work:\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"[1] [('Оргкомитет', 'news@hrsummit.ru')] <class 'list'>\n",
|
|
"[2] []\n",
|
|
"[3] set()\n",
|
|
"[4] [('', 'karasek.jindrich@gmail.com')]\n",
|
|
"[5] ['gmail.com']\n",
|
|
"[6] [{'by': 'smtp368.emlone.com', 'id': 'hed7s62erm86', 'for': '<karasek.jindrich@gmail.com>', 'envelope_from': 'postman3496981@justeml.com', 'date': 'Tue, 15 Nov 2022 08:37:23 +0000 envelope-from <postman3496981@justeml.com>', 'hop': 1, 'date_utc': '2022-11-15T08:37:23', 'delay': 0}, {'from': 'smtp573.emlone.com smtp573.emlone.com. 87.246.187.152', 'by': 'mx.google.com', 'with': 'ESMTPS', 'id': 'i1-20020a2ea361000000b0026fbd8bb585si5659533ljn.64.2022.11.15.00.37.23', 'for': '<karasek.jindrich@gmail.com> version=TLS1_2 cipher=ECDHE-ECDSA-AES128-GCM-SHA256 bits=128/128', 'date': 'Tue, 15 Nov 2022 00:37:23 -0800 PST', 'hop': 2, 'date_utc': '2022-11-15T08:37:23', 'delay': 0.0}, {'by': '2002:a05:6638:59c:b0:375:9b14:11af', 'with': 'SMTP', 'id': 'a28csp3262577jar', 'date': 'Tue, 15 Nov 2022 00:37:24 -0800 PST', 'hop': 3, 'date_utc': '2022-11-15T08:37:24', 'delay': 1.0}]\n",
|
|
"[7] None\n",
|
|
"[8] [('', 'karasek.jindrich@gmail.com')]\n",
|
|
"[9] []\n",
|
|
"[10] Один день до CDO/CDTO Summit & Award 17 НОЯБРЯ 2022\n",
|
|
"[11] []\n",
|
|
"[16] <E1ourRb-P0bP3O-NK@ucs301-ucs-6.msgpanel.com>\n",
|
|
"[19] +0.0\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import mailparser\n",
|
|
"\n",
|
|
"#https://pypi.org/project/mail-parser/\n",
|
|
"\"\"\"\n",
|
|
"mail = mailparser.parse_from_bytes(byte_mail)\n",
|
|
"mail = mailparser.parse_from_file(f)\n",
|
|
"mail = mailparser.parse_from_file_msg(outlook_mail)\n",
|
|
"mail = mailparser.parse_from_file_obj(fp)\n",
|
|
"mail = mailparser.parse_from_string(raw_mail)\n",
|
|
"\n",
|
|
"mail.attachments: list of all attachments\n",
|
|
"mail.body\n",
|
|
"mail.date: datetime object in UTC\n",
|
|
"mail.defects: defect RFC not compliance\n",
|
|
"mail.defects_categories: only defects categories\n",
|
|
"mail.delivered_to\n",
|
|
"mail.from_\n",
|
|
"mail.get_server_ipaddress(trust=\"my_server_mail_trust\")\n",
|
|
"mail.headers\n",
|
|
"mail.mail: tokenized mail in a object\n",
|
|
"mail.message: email.message.Message object\n",
|
|
"mail.message_as_string: message as string\n",
|
|
"mail.message_id\n",
|
|
"mail.received\n",
|
|
"mail.subject\n",
|
|
"mail.text_plain: only text plain mail parts in a list\n",
|
|
"mail.text_html: only text html mail parts in a list\n",
|
|
"mail.text_not_managed: all not managed text (check the warning logs to find content subtype)\n",
|
|
"mail.to\n",
|
|
"mail.to_domains\n",
|
|
"mail.timezone: returns the timezone, offset from UTC\n",
|
|
"mail.mail_partial: returns only the mains parts of emails\n",
|
|
" \n",
|
|
"#Write attachments on disc\n",
|
|
"mail.write_attachments(base_path)\n",
|
|
"\"\"\"\n",
|
|
"\n",
|
|
"\n",
|
|
"raw_email=filename\n",
|
|
"\n",
|
|
"mail = mailparser.parse_from_file(raw_email)\n",
|
|
"\n",
|
|
"print(\"[1]\",mail.from_,type(mail.from_))\n",
|
|
"print(\"[2]\",mail.defects)\n",
|
|
"print(\"[3]\",mail.defects_categories)\n",
|
|
"print(\"[4]\",mail.to)\n",
|
|
"print(\"[5]\",mail.to_domains)\n",
|
|
"print(\"[6]\",mail.received)\n",
|
|
"print(\"[7]\",mail.get_server_ipaddress(trust=\"my_server_mail_trust\"))\n",
|
|
"print(\"[8]\",mail.delivered_to)\n",
|
|
"print(\"[9]\",mail.attachments)\n",
|
|
"print(\"[10]\",mail.subject)\n",
|
|
"print(\"[11]\",mail.text_not_managed)\n",
|
|
"#print(\"[12]\",mail.headers)\n",
|
|
"#print(\"[13]\",mail.mail_partial)\n",
|
|
"#print(\"[14]\",mail.text_plain) #works\n",
|
|
"#print(\"[15]\",mail.text_html) #works\n",
|
|
"print(\"[16]\",mail.message_id)\n",
|
|
"#print(\"[17]\",mail.message_as_string)\n",
|
|
"#print(\"[18]\",mail.body)\n",
|
|
"print(\"[19]\",mail.timezone)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Python functions ready to be incorporated to the analytical framework:\n",
|
|
"\"email_miner_text\" is applied to each email file to extract the metadata for the hunting.\n",
|
|
"\"email_dir_miner\" is an alternative of above function, in case you have already a pile of eml. files to process and also generates the graph of sender- receiver structure to identify the important topological structures in chain mail campaigns."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"[*]Duplicates dropped.\n",
|
|
" from \\\n",
|
|
"count 20 \n",
|
|
"unique 16 \n",
|
|
"top [('LOGYTalks - Blockchain & Cryptocurrency Sum... \n",
|
|
"freq 2 \n",
|
|
"\n",
|
|
" mailto \n",
|
|
"count 20 \n",
|
|
"unique 9 \n",
|
|
"top [('', 'karasek.jindrich@gmail.com')] \n",
|
|
"freq 8 \n",
|
|
"[*] CSV file saved, done!\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"def email_miner_text(email_file): # takes single eml file and mines the metadata for the analysis.\n",
|
|
" import datetime\n",
|
|
" import json\n",
|
|
" import eml_parser\n",
|
|
" import numpy as np\n",
|
|
" import pandas as pd\n",
|
|
" import os\n",
|
|
" import mailparser\n",
|
|
" #https://pypi.org/project/mail-parser/\n",
|
|
" \"\"\"Dataframe design\"\"\"\n",
|
|
" lcolumns=[\"Filename\",\"messageID\",\"from\",\"subject\",\"mailto\",\"mailtod\",\"received\",\"attachments\",\"text-html\",\"text\"]\n",
|
|
" ledf=pd.DataFrame(columns=lcolumns)\n",
|
|
" try:\n",
|
|
" with open(email_file,'rb') as fhdl:\n",
|
|
" raw_email=fhdl.read()\n",
|
|
" #print(filename)\n",
|
|
" #all the code for edf filling comes here\n",
|
|
" mail = mailparser.parse_from_file(raw_email)\n",
|
|
" data=np.array([str(file),str(mail.message_id),str(mail.from_),str(mail.subject),str(mail.to),str(mail.to_domains),str(mail.received),str(mail.attachments),str(mail.text_html),str(mail.text)])\n",
|
|
" #row=pd.Series(data,index=lcolumns)\n",
|
|
" #ledf=ledf.append(pd.Series(row),ignore_index=True)\n",
|
|
" except:\n",
|
|
" pass\n",
|
|
" #print(ledf)\n",
|
|
" #print(ledf.describe())\n",
|
|
" #print(ledf[\"subject\"],ledf[\"Filename\"])\n",
|
|
" #deduplicate the edgelist\n",
|
|
" #ledf.drop_duplicates(keep='first')\n",
|
|
" #save the edgelists to a csv files\n",
|
|
" #ledf.to_csv(\"/Users/jindrich_karasek/data-science/disinfo-corpus/maily/ledf.csv\", sep=',', encoding='utf-8')\n",
|
|
" #print(mail.message_as_string)\n",
|
|
" #print(mail.text_plain)\n",
|
|
" return mail.text_plain\n",
|
|
"\n",
|
|
"def email_dir_miner(path): #Generates the matrix for the graph of the sending emails structure.\n",
|
|
" \"\"\"Read ALL directory,TOP DOWN from starting point, select all the *.eml files and parse metadata into the dataframe\"\"\"\n",
|
|
" import datetime\n",
|
|
" import json\n",
|
|
" import eml_parser\n",
|
|
" import numpy as np\n",
|
|
" import pandas as pd\n",
|
|
" import os\n",
|
|
" import mailparser\n",
|
|
" import tqdm\n",
|
|
"\n",
|
|
" #https://pypi.org/project/mail-parser/\n",
|
|
" \"\"\"Dataframe design\"\"\"\n",
|
|
" #lcolumns=[\"Filename\",\"messageID\",\"from\",\"subject\",\"mailto\",\"mailtod\",\"received\",\"attachments\",\"text-html\",\"text\"]\n",
|
|
" lcolumns=[\"from\",\"mailto\"]\n",
|
|
" ledf=pd.DataFrame(columns=lcolumns)\n",
|
|
" \"\"\"iterate over the files and add pd series generated from each into the final datatframe\"\"\"\n",
|
|
" #https://www.bogotobogo.com/python/python_traversing_directory_tree_recursively_os_walk.php\n",
|
|
" \n",
|
|
" i=0\n",
|
|
" for pth,subdirs,files in os.walk(path):\n",
|
|
" for name in files:\n",
|
|
" file=os.path.join(pth,name)\n",
|
|
" #print(file)\n",
|
|
" filename=str(str(path)+(os.fsdecode(file)))\n",
|
|
" #print(file)\n",
|
|
" if file.endswith(\".eml\"):\n",
|
|
" i=i+1\n",
|
|
" #print(i)\n",
|
|
" try:\n",
|
|
" with open(file,'rb') as fhdl:\n",
|
|
" raw_email=fhdl.read()\n",
|
|
" #print(filename)\n",
|
|
" #all the code for edf filling comes here\n",
|
|
" mail = mailparser.parse_from_bytes(raw_email)\n",
|
|
" #data=np.array([str(file),str(mail.message_id),str(mail.from_),str(mail.subject),str(mail.to),str(mail.to_domains),str(mail.received),str(mail.attachments),str(mail.text_html)])\n",
|
|
" data={\"Filename\":str(file),\"messageID\":str(mail.message_id),\"from\":str(mail.from_),\"subject\":str(mail.subject),\"mailto\":str(mail.to),\"mailtod\":str(mail.to_domains),\"received\":str(mail.received),\"attachments\":str(mail.attachments),\"text-html\":str(mail.text_html),\"text\":str(mail.text)}\n",
|
|
" #data={\"from\":str(mail.from_).replace('[','').replace(']','').replace('(','').replace(')',''),\"mailto\":str(mail.to).replace(']','').replace('[','').replace('(','').replace(')','')}\n",
|
|
" row=pd.Series(data,index=lcolumns)\n",
|
|
" ledf=ledf.append(pd.Series(row),ignore_index=True)\n",
|
|
" continue\n",
|
|
" except:\n",
|
|
" continue \n",
|
|
" else:\n",
|
|
" continue\n",
|
|
" if i > 10000000:\n",
|
|
" break\n",
|
|
" \n",
|
|
" #Turning ledf into the dataframe-list of emails:\n",
|
|
" email_df1=ledf[\"from\"]\n",
|
|
" email_df2=ledf[\"mailto\"]\n",
|
|
" \n",
|
|
" email_df=email_df1.append(email_df2)\n",
|
|
" \n",
|
|
" #deduplicate the edgelist\n",
|
|
" ledf.drop_duplicates(keep='first')\n",
|
|
" email_df.drop_duplicates(keep='first')\n",
|
|
" print(\"[*]Duplicates dropped.\")\n",
|
|
" #print(edf)\n",
|
|
" print(ledf.describe())\n",
|
|
" #print(ledf[\"subject\"],ledf[\"from\"])\n",
|
|
" #save the edgelists to a csv files\n",
|
|
" ledf_dir=directory+\"edf.csv\"\n",
|
|
" ledf.to_csv(ledf_dir, sep=',', encoding='utf-8')\n",
|
|
" ledf_dir2=directory+\"email_df.csv\"\n",
|
|
" email_df.to_csv(ledf_dir2,sep=',',index=False,header=False)\n",
|
|
" print(\"[*] CSV file saved, done!\")\n",
|
|
"\n",
|
|
"#------------------------------\n",
|
|
"\n",
|
|
"email_dir_miner(\"./emails/\")\n",
|
|
"#print(\"Output of email_miner_text(): \",\"\\n\",email_miner_text(raw_email))\n",
|
|
"#print(\"Output of email_miner_data(): \",\"\\n\",email_miner_data(raw_email))"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3.8.8 ('eolas-py3.8.8')",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.8"
|
|
},
|
|
"vscode": {
|
|
"interpreter": {
|
|
"hash": "7217508cf40a866d2c6d8c05c8a287a7af39b44bc942772df40ff0edc82b5da6"
|
|
}
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|