 22abaf93d8
			
		
	
	
		22abaf93d8
		
	
	
	
	
		
			
			Took a copy of the current AMITT GitHub repository - we'll be updating this and merging the SPICE branch back in. Rebranded to DISARM. Moved generated pages to their own folder, to make looking at the repository less confusing.
		
			
				
	
	
		
			111 строки
		
	
	
		
			3.7 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			111 строки
		
	
	
		
			3.7 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| {
 | |
|  "cells": [
 | |
|   {
 | |
|    "cell_type": "markdown",
 | |
|    "metadata": {},
 | |
|    "source": [
 | |
|     "# Fix the problem with Excel file changes\n",
 | |
|     "\n",
 | |
|     "Background: DISARM's master dataset is in an Excel file. Changes in it don't show up in GitHub, so it's difficult to tell what's changed between versions. The code below checks for those differences - use it repeatedly until the versions align."
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": null,
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "import pandas as pd\n",
 | |
|     "import sqlite3 as sql\n",
 | |
|     "from generate_DISARM_pages import Disarm\n",
 | |
|     "import pandas as pd\n",
 | |
|     "import numpy as np\n",
 | |
|     "import os\n",
 | |
|     "from sklearn.feature_extraction.text import CountVectorizer\n",
 | |
|     "pd.set_option('display.max_rows', 1000)\n",
 | |
|     "pd.set_option('display.max_colwidth', -1)\n",
 | |
|     "\n",
 | |
|     "newfile = '../DISARM_MASTER_DATA/DISARM_FRAMEWORKS_MASTER.xlsx'\n",
 | |
|     "oldfile = '../DISARM_MASTER_DATA/DISARM_FRAMEWORKS_MASTER_previous_version.xlsx'\n",
 | |
|     "\n",
 | |
|     "# Load dfs from file\n",
 | |
|     "newdfs = {}\n",
 | |
|     "newxlsx = pd.ExcelFile(newfile)\n",
 | |
|     "for sheetname in newxlsx.sheet_names:\n",
 | |
|     "    newdfs[sheetname] = newxlsx.parse(sheetname)\n",
 | |
|     "    newdfs[sheetname].fillna('', inplace=True)\n",
 | |
|     "\n",
 | |
|     "olddfs = {}\n",
 | |
|     "oldxlsx = pd.ExcelFile(oldfile)\n",
 | |
|     "for sheetname in oldxlsx.sheet_names:\n",
 | |
|     "    olddfs[sheetname] = oldxlsx.parse(sheetname)\n",
 | |
|     "    olddfs[sheetname].fillna('', inplace=True)\n",
 | |
|     "\n",
 | |
|     "addedtables = newdfs.keys() - olddfs.keys()\n",
 | |
|     "losttables = olddfs.keys() - newdfs.keys()\n",
 | |
|     "if len(addedtables) + len(losttables) > 0:\n",
 | |
|     "    print('Table changes: new tables are {}, lost tables are {}'.format(addedtables, losttables))\n",
 | |
|     "\n",
 | |
|     "def investigate_table(table):\n",
 | |
|     "    print('\\n\\nTable {} is changed'.format(table))\n",
 | |
|     "    # Column headings\n",
 | |
|     "    coldiffs = set(newdfs[table].columns).symmetric_difference(set(olddfs[table].columns))\n",
 | |
|     "    if len(coldiffs) > 0:\n",
 | |
|     "        print('column differences: {}'.format(coldiffs))\n",
 | |
|     "    # length\n",
 | |
|     "    if len(newdfs[table]) != len(olddfs[table]):\n",
 | |
|     "        print('length differences: new {} old {}'.format(len(newdfs[table]), len(olddfs[table])))\n",
 | |
|     "\n",
 | |
|     "    # column by column\n",
 | |
|     "    for column in newdfs[table].columns:\n",
 | |
|     "        coldiffs = newdfs[table][column] != olddfs[table][column]\n",
 | |
|     "        if len(newdfs[table][coldiffs]) > 0:\n",
 | |
|     "            print('Differences in column {}'.format(column))\n",
 | |
|     "    return\n",
 | |
|     "\n",
 | |
|     "for table in newdfs.keys():\n",
 | |
|     "    if newdfs[table].equals(olddfs[table]) == False:\n",
 | |
|     "        investigate_table(table)"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": null,
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "# Look at individual table differences\n",
 | |
|     "table = 'countermeasures'\n",
 | |
|     "column = 'summary'\n",
 | |
|     "coldiffs = newdfs[table][column] != olddfs[table][column]\n",
 | |
|     "diffcols = pd.DataFrame()\n",
 | |
|     "diffcols['amitt_id'] = newdfs[table][coldiffs]['amitt_id']\n",
 | |
|     "diffcols['new'] = newdfs[table][coldiffs][column]\n",
 | |
|     "diffcols['old'] = olddfs[table][coldiffs][column]\n",
 | |
|     "diffcols[diffcols['old'] != '']"
 | |
|    ]
 | |
|   }
 | |
|  ],
 | |
|  "metadata": {
 | |
|   "kernelspec": {
 | |
|    "display_name": "Python 3",
 | |
|    "language": "python",
 | |
|    "name": "python3"
 | |
|   },
 | |
|   "language_info": {
 | |
|    "codemirror_mode": {
 | |
|     "name": "ipython",
 | |
|     "version": 3
 | |
|    },
 | |
|    "file_extension": ".py",
 | |
|    "mimetype": "text/x-python",
 | |
|    "name": "python",
 | |
|    "nbconvert_exporter": "python",
 | |
|    "pygments_lexer": "ipython3",
 | |
|    "version": "3.8.3"
 | |
|   }
 | |
|  },
 | |
|  "nbformat": 4,
 | |
|  "nbformat_minor": 4
 | |
| }
 |