UNICRI-Decode-CW/5-Email-data-mining.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "5 - Email analysis:\n",
    "Below are code examples that will help you parse a single email message file, or a whole directory of malicious emails, and extract various metadata for further use.\n",
    "They can be combined with the previous code, e.g. to extract named entities or sentiment, or even to draw a graph."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Directory and example file selection:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Folder that holds the sample .eml messages.\n",
    "directory = \"./emails/\"\n",
    "\n",
    "# Single message used by the mailparser walkthrough in the next code cell.\n",
    "filename = directory + '1.eml'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Working example for the mailparser library that you can use for your further work:\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1] [('Оргкомитет', 'news@hrsummit.ru')] <class 'list'>\n",
      "[2] []\n",
      "[3] set()\n",
      "[4] [('', 'karasek.jindrich@gmail.com')]\n",
      "[5] ['gmail.com']\n",
      "[6] [{'by': 'smtp368.emlone.com', 'id': 'hed7s62erm86', 'for': '<karasek.jindrich@gmail.com>', 'envelope_from': 'postman3496981@justeml.com', 'date': 'Tue, 15 Nov 2022 08:37:23 +0000 envelope-from <postman3496981@justeml.com>', 'hop': 1, 'date_utc': '2022-11-15T08:37:23', 'delay': 0}, {'from': 'smtp573.emlone.com smtp573.emlone.com. 87.246.187.152', 'by': 'mx.google.com', 'with': 'ESMTPS', 'id': 'i1-20020a2ea361000000b0026fbd8bb585si5659533ljn.64.2022.11.15.00.37.23', 'for': '<karasek.jindrich@gmail.com> version=TLS1_2 cipher=ECDHE-ECDSA-AES128-GCM-SHA256 bits=128/128', 'date': 'Tue, 15 Nov 2022 00:37:23 -0800 PST', 'hop': 2, 'date_utc': '2022-11-15T08:37:23', 'delay': 0.0}, {'by': '2002:a05:6638:59c:b0:375:9b14:11af', 'with': 'SMTP', 'id': 'a28csp3262577jar', 'date': 'Tue, 15 Nov 2022 00:37:24 -0800 PST', 'hop': 3, 'date_utc': '2022-11-15T08:37:24', 'delay': 1.0}]\n",
      "[7] None\n",
      "[8] [('', 'karasek.jindrich@gmail.com')]\n",
      "[9] []\n",
      "[10] Один день до CDO/CDTO Summit & Award 17 НОЯБРЯ 2022\n",
      "[11] []\n",
      "[16] <E1ourRb-P0bP3O-NK@ucs301-ucs-6.msgpanel.com>\n",
      "[19] +0.0\n"
     ]
    }
   ],
   "source": [
    "import mailparser\n",
    "\n",
    "#https://pypi.org/project/mail-parser/\n",
    "\"\"\"\n",
    "mail = mailparser.parse_from_bytes(byte_mail)\n",
    "mail = mailparser.parse_from_file(f)\n",
    "mail = mailparser.parse_from_file_msg(outlook_mail)\n",
    "mail = mailparser.parse_from_file_obj(fp)\n",
    "mail = mailparser.parse_from_string(raw_mail)\n",
    "\n",
    "mail.attachments: list of all attachments\n",
    "mail.body\n",
    "mail.date: datetime object in UTC\n",
    "mail.defects: defect RFC not compliance\n",
    "mail.defects_categories: only defects categories\n",
    "mail.delivered_to\n",
    "mail.from_\n",
    "mail.get_server_ipaddress(trust=\"my_server_mail_trust\")\n",
    "mail.headers\n",
    "mail.mail: tokenized mail in a object\n",
    "mail.message: email.message.Message object\n",
    "mail.message_as_string: message as string\n",
    "mail.message_id\n",
    "mail.received\n",
    "mail.subject\n",
    "mail.text_plain: only text plain mail parts in a list\n",
    "mail.text_html: only text html mail parts in a list\n",
    "mail.text_not_managed: all not managed text (check the warning logs to find content subtype)\n",
    "mail.to\n",
    "mail.to_domains\n",
    "mail.timezone: returns the timezone, offset from UTC\n",
    "mail.mail_partial: returns only the mains parts of emails\n",
    "    \n",
    "#Write attachments on disc\n",
    "mail.write_attachments(base_path)\n",
    "\"\"\"\n",
    "\n",
    "\n",
    "raw_email = filename  # path of the .eml file picked in the selection cell\n",
    "\n",
    "mail = mailparser.parse_from_file(raw_email)\n",
    "\n",
    "# Numbered dump of the parsed fields; the numbers are just labels for the output.\n",
    "# Bulky fields are left out on purpose: [12] headers, [13] mail_partial,\n",
    "# [14] text_plain (works), [15] text_html (works), [17] message_as_string, [18] body.\n",
    "print(\"[1]\", mail.from_, type(mail.from_))\n",
    "for tag, attr in (\n",
    "    (\"[2]\", \"defects\"),\n",
    "    (\"[3]\", \"defects_categories\"),\n",
    "    (\"[4]\", \"to\"),\n",
    "    (\"[5]\", \"to_domains\"),\n",
    "    (\"[6]\", \"received\"),\n",
    "):\n",
    "    print(tag, getattr(mail, attr))\n",
    "print(\"[7]\", mail.get_server_ipaddress(trust=\"my_server_mail_trust\"))\n",
    "for tag, attr in (\n",
    "    (\"[8]\", \"delivered_to\"),\n",
    "    (\"[9]\", \"attachments\"),\n",
    "    (\"[10]\", \"subject\"),\n",
    "    (\"[11]\", \"text_not_managed\"),\n",
    "):\n",
    "    print(tag, getattr(mail, attr))\n",
    "print(\"[16]\", mail.message_id)\n",
    "print(\"[19]\", mail.timezone)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Python functions ready to be incorporated into the analytical framework:\n",
    "\"email_miner_text\" is applied to a single email file to extract the metadata for hunting.\n",
    "\"email_dir_miner\" is an alternative to the above function for when you already have a pile of .eml files to process; it also generates the graph of the sender-receiver structure to identify the important topological structures in chain mail campaigns."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[*]Duplicates dropped.\n",
      "                                                     from  \\\n",
      "count                                                  20   \n",
      "unique                                                 16   \n",
      "top     [('LOGYTalks - Blockchain & Cryptocurrency Sum...   \n",
      "freq                                                    2   \n",
      "\n",
      "                                      mailto  \n",
      "count                                     20  \n",
      "unique                                     9  \n",
      "top     [('', 'karasek.jindrich@gmail.com')]  \n",
      "freq                                       8  \n",
      "[*] CSV file saved, done!\n"
     ]
    }
   ],
   "source": [
    "def email_miner_text(email_file):\n",
    "    \"\"\"Return the plain-text parts of a single .eml message.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    email_file : str or path-like\n",
    "        Path of the .eml file to read.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list\n",
    "        ``mail.text_plain`` as produced by mailparser (per the mail-parser\n",
    "        docs: the text/plain parts of the message). An empty list is returned\n",
    "        when the file cannot be read or parsed; the reason is printed.\n",
    "    \"\"\"\n",
    "    import mailparser  # https://pypi.org/project/mail-parser/\n",
    "\n",
    "    try:\n",
    "        with open(email_file, 'rb') as fhdl:\n",
    "            raw_email = fhdl.read()\n",
    "        # The message is already in memory, so parse the bytes directly;\n",
    "        # parse_from_file() expects a path, not the raw content.\n",
    "        mail = mailparser.parse_from_bytes(raw_email)\n",
    "    except Exception as err:\n",
    "        # Best effort: report the problem instead of hiding it, and return an\n",
    "        # empty result rather than failing later on an unbound `mail`.\n",
    "        print(\"[!] Could not parse\", email_file, \"-\", err)\n",
    "        return []\n",
    "    return mail.text_plain\n",
    "\n",
    "def email_dir_miner(path, output_dir=None, max_files=10000000):\n",
    "    \"\"\"Build the sender -> recipient edge list for every .eml file under `path`.\n",
    "\n",
    "    The directory tree is walked top-down with os.walk. Each file whose name\n",
    "    ends in \".eml\" is parsed with mailparser and contributes one row holding\n",
    "    str(mail.from_) and str(mail.to). Files that cannot be read or parsed are\n",
    "    reported and skipped.\n",
    "\n",
    "    Two CSV files are written:\n",
    "      * edf.csv      - de-duplicated edge list (columns \"from\", \"mailto\")\n",
    "      * email_df.csv - de-duplicated list of every sender/recipient entry\n",
    "                       (no header, no index)\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    path : str\n",
    "        Root directory to scan.\n",
    "    output_dir : str, optional\n",
    "        Directory the CSV files are written to; defaults to `path`.\n",
    "    max_files : int, optional\n",
    "        Safety cap: the walk stops after the directory in which the number of\n",
    "        .eml files seen so far exceeds this value.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    None\n",
    "    \"\"\"\n",
    "    import os\n",
    "\n",
    "    import mailparser  # https://pypi.org/project/mail-parser/\n",
    "    import pandas as pd\n",
    "\n",
    "    lcolumns = [\"from\", \"mailto\"]\n",
    "    if output_dir is None:\n",
    "        output_dir = path\n",
    "\n",
    "    records = []\n",
    "    seen = 0\n",
    "    for pth, subdirs, files in os.walk(path):\n",
    "        for name in files:\n",
    "            if not name.endswith(\".eml\"):\n",
    "                continue\n",
    "            seen += 1\n",
    "            eml_path = os.path.join(pth, name)\n",
    "            try:\n",
    "                with open(eml_path, 'rb') as fhdl:\n",
    "                    raw_email = fhdl.read()\n",
    "                mail = mailparser.parse_from_bytes(raw_email)\n",
    "                records.append({\"from\": str(mail.from_), \"mailto\": str(mail.to)})\n",
    "            except Exception as err:\n",
    "                # Best effort: one broken message must not stop the whole run,\n",
    "                # but say which file was skipped and why.\n",
    "                print(\"[!] Skipping\", eml_path, \"-\", err)\n",
    "        if seen > max_files:\n",
    "            break\n",
    "\n",
    "    # Build the frame once from the collected rows; DataFrame.append was\n",
    "    # removed in pandas 2.0 and growing a frame row by row is quadratic.\n",
    "    ledf = pd.DataFrame(records, columns=lcolumns)\n",
    "\n",
    "    # drop_duplicates returns a new object, so the result has to be kept.\n",
    "    ledf = ledf.drop_duplicates(keep='first')\n",
    "    email_df = pd.concat([ledf[\"from\"], ledf[\"mailto\"]]).drop_duplicates(keep='first')\n",
    "    print(\"[*]Duplicates dropped.\")\n",
    "    print(ledf.describe())\n",
    "\n",
    "    # Save the edge list and the flat address list as CSV files.\n",
    "    ledf.to_csv(os.path.join(output_dir, \"edf.csv\"), sep=',', encoding='utf-8')\n",
    "    email_df.to_csv(os.path.join(output_dir, \"email_df.csv\"), sep=',', index=False, header=False)\n",
    "    print(\"[*] CSV file saved, done!\")\n",
    "\n",
    "# ---- Run: mine every .eml file below ./emails/ ----\n",
    "email_dir_miner(\"./emails/\")\n",
    "\n",
    "# Optional single-file check (raw_email is set in the mailparser example cell):\n",
    "# print(\"Output of email_miner_text(): \", \"\\n\", email_miner_text(raw_email))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.8.8 ('eolas-py3.8.8')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  },
  "vscode": {
   "interpreter": {
    "hash": "7217508cf40a866d2c6d8c05c8a287a7af39b44bc942772df40ff0edc82b5da6"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}