{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# IMPORT NECESSARY LIBRARIES AND MODULES\n", "import numpy as np\n", "import scipy.stats as stats\n", "import pandas as pd\n", "import re\n", "import time\n", "import datetime\n", "import itertools\n", "import tqdm as tqdm\n", "\n", "%set_env PYTHONHASHSEED=1\n", "\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "from sklearn.metrics import roc_auc_score, accuracy_score\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer\n", "from sklearn.neural_network import MLPClassifier\n", "\n", "from gensim.models import Doc2Vec\n", "from gensim.models.doc2vec import Doc2Vec, TaggedDocument\n", "from gensim.parsing.preprocessing import preprocess_string, strip_multiple_whitespaces\n", "from gensim.corpora import Dictionary\n", "from gensim.matutils import corpus2dense" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#\n", "# ALL DATA AND PRE-PROCESSING CODE IS UNAVAILBLE TO UNAUTHORIZED USERS DUE TO PRIVACY AND CONFIDENTIALITY RESTRICTIONS\n", "#" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# IMPORT RAW ORIGINAL DATA AND ESTABLISH TIMESTAMP FOR ANALYTIC PROCESSES\n", "rawData = pd.read_json('ksu_full.json')\n", "today = re.sub(string=str(datetime.datetime.today()), pattern=\"\\W\", repl=\"\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# REQUISITE DATAFRAME COLUMNS AND CONTENTS\n", "# \"segCode\" : (Optional) variable manifest from data cleaning stage indicating samples that are suited (0) or not suited (1) for analysis.\n", "# \"Start Date\" : The datetime the the chat transcript started\n", "# \"Question Type\" : Labels for question-type categories as labelled by VRS operators\n", "# \"READ_1_vs_2\" : Binary values of 0 or 1. Values of 999 used to identify missing data\n", "# \"READ_2_vs_3\" : Binary values of 0 or 1. Values of 999 used to identify missing data\n", "# \"PatronTextString\" : The full text of strictly patron-supplied text drawn from transcript" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# CONVERT, FILTER, AND SORT DATA AS NEEDED\n", "rawData['Start Date'] = pd.to_datetime(rawData['Start Date'],unit='ms')\n", "rawData = rawData[rawData['segCode']==0]\n", "rawData = rawData.sort_index()\n", "rawData = rawData.sort_values(by='Start Date')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# GENERATE PATRON-TEXT SECTIONS FOR ANALYSIS\n", "def getPatronSections(df,breaks):\n", " st = time.time()\n", " \n", " df = df.copy()\n", " corpus = df['PatronTextString'].str.split(pat=\"\\s{1,}\").copy()\n", " \n", " for i in corpus.index:\n", " \n", " df.loc[i,'TRUNC_5'] = \" \".join(corpus.loc[i][:breaks[0]])\n", " df.loc[i,'TRUNC_10'] = \" \".join(corpus.loc[i][:breaks[1]])\n", " df.loc[i,'TRUNC_20'] = \" \".join(corpus.loc[i][:breaks[2]])\n", " \n", " et = time.time() - st\n", " print('{:.2f} : Splitting Patron Lines'.format(et)) \n", " return(df) \n", "\n", "patronSegmentsOptions = [5,10,20]\n", "rawData = getPatronSections(df=rawData,breaks=patronSegmentsOptions)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# DEFINE REGULAR EXPRESSION PATTERNS FOR 'TAG' MODELLING PARAMETER\n", "manualTags = [\n", " ['tagURL',[\n", " re.escape('amazon.com'),\n", " re.escape('newfirstsearch'),\n", " re.escape('galegroup'),\n", " re.escape('ingentaconnect.com'),\n", " re.escape('proquest.com'),\n", " re.escape('ncbi.nlm.nih.gov'),\n", " re.escape('sciencedirect.com'),\n", " re.escape('springer.com'),\n", " re.escape('tandfonline.com'),\n", " re.escape('webofknowledge'),\n", " re.escape('wiley.com'),\n", " re.escape('books.google'),\n", " re.escape('google.com'),\n", "\n", " re.escape('apps.lib.k-state.edu/databases'),\n", "\n", " re.escape('er.lib.ksu.edu'),\n", " re.escape('er.lib.k-state.edu'),\n", "\n", " re.escape('getit.lib.ksu.edu'),\n", " re.escape('getit.lib.k-state.edu'),\n", "\n", " re.escape('guides.lib.ksu.edu'),\n", " re.escape('guides.lib.k-state.edu'),\n", "\n", " re.escape('catalog.lib.ksu.edu'),\n", " re.escape('catalog2.lib.ksu.edu'),\n", " re.escape('catalog.lib.k-state.edu'),\n", " re.escape('catalog2.lib.k-state.edu'),\n", "\n", " re.escape('primo.hosted.exlibrisgroup.com'),\n", " re.escape('na02.alma.exlibrisgroup'),\n", "\n", " re.escape('searchit.lib.ksu.edu'),\n", " re.escape('searchit.lib.k-state.edu'),\n", "\n", " re.escape('lib.k-state.edu'),\n", " re.escape('lib.k-state.edu'),\n", "\n", " re.escape('doi.org'),\n", "\n", " re.escape('http'),\n", " re.escape('www.'),]\n", " ],\n", " \n", " ['tagPRINTING',[\n", " 'color print',\n", " 'colored print',\n", " 'print in color',\n", " 'print something in color',\n", " '\\Win color\\W',\n", " 'cat cash',\n", " 'printer',\n", " '(? 0:\n", " listIndex.append(int(i))\n", " else:\n", " notIndex.append(int(i))\n", "\n", " Xlist = representation.loc[listIndex,:]\n", " y_prob_list = neural_model.predict_proba(Xlist)[:,1]\n", " yprob_pred_list = neural_model.predict(Xlist) \n", "\n", " RocAucScore_list = roc_auc_score(y_true=labels.loc[listIndex],y_score=y_prob_list)\n", " AccScore_list = accuracy_score(y_true=labels.loc[listIndex],y_pred=yprob_pred_list)\n", " df.loc[listIndex,'PredictProbList'] = y_prob_list\n", " df.loc[listIndex,'PredictPredList'] = yprob_pred_list\n", "\n", " Xlist = representation.loc[notIndex,:]\n", " y_prob_list = neural_model.predict_proba(Xlist)[:,1]\n", " yprob_pred_list = neural_model.predict(Xlist)\n", "\n", " RocAucScore_NOTlist = roc_auc_score(y_true=labels.loc[notIndex],y_score=y_prob_list)\n", " AccScore_NOTlist = accuracy_score(y_true=labels.loc[notIndex],y_pred=yprob_pred_list)\n", " df.loc[notIndex,'PredictProbNOTList'] = y_prob_list\n", " df.loc[notIndex,'PredictPredNOTList'] = yprob_pred_list \n", " \n", " \n", " \n", " df.loc[labels.index,'PredictProb'] = yprob\n", " df.loc[labels.index,'PredictPred'] = yprob_pred\n", " \n", " et = time.time() - st\n", " print('{:.2f} : Running MLP Fit and Eval'.format(et))\n", " return(neural_model,RocAucScore,RocAucScore_list,RocAucScore_NOTlist,AccScore,AccScore_list,AccScore_NOTlist,len(listIndex),len(notIndex),df.loc[labels.index,:],X)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def testModel(df,targetLabels,d2vModel,dictionary,neural_model):\n", " st = time.time()\n", " \n", " df = df.copy()\n", " \n", " labels = df.loc[df[targetLabels]!=999,targetLabels].astype(int).copy()\n", " df_test = df.loc[labels.index,].copy()\n", " \n", " \n", " indic = []\n", " dat = []\n", " for i in df_test.index:\n", " indic.append(i)\n", "# dat.append(d2vModel.infer_vector((df_test.loc[i,'tokenizedTexts']+df_test.loc[i,'manualTags']), steps=100))\n", " dat.append(d2vModel.infer_vector((df_test.loc[i,'tokenizedTexts']), steps=100))\n", "\n", " representation = pd.DataFrame(dat,index=indic)\n", "\n", " X = representation\n", " yTest_prob = neural_model.predict_proba(X)[:,1]\n", " yTest_pred = neural_model.predict(X)\n", " RocAucScore = roc_auc_score(y_true=labels,y_score=yTest_prob)\n", " AccScore = accuracy_score(y_true=labels,y_pred=yTest_pred)\n", " \n", " listIndex = []\n", " notIndex = []\n", " \n", " for i in df.loc[labels.index,].index:\n", " if len(df.loc[i,'manualTags']) > 0:\n", " listIndex.append(int(i))\n", " else:\n", " notIndex.append(int(i))\n", "\n", " Xlist = representation.loc[listIndex,:]\n", " y_prob_list = neural_model.predict_proba(Xlist)[:,1]\n", " yprob_pred_list = neural_model.predict(Xlist) \n", "\n", " RocAucScore_list = roc_auc_score(y_true=labels.loc[listIndex],y_score=y_prob_list)\n", " AccScore_list = accuracy_score(y_true=labels.loc[listIndex],y_pred=yprob_pred_list)\n", " df.loc[listIndex,'PredictProbList'] = y_prob_list\n", " df.loc[listIndex,'PredictPredList'] = yprob_pred_list\n", "\n", "\n", "\n", " Xlist = representation.loc[notIndex,:]\n", " y_prob_list = neural_model.predict_proba(Xlist)[:,1]\n", " yprob_pred_list = neural_model.predict(Xlist)\n", "\n", " RocAucScore_NOTlist = roc_auc_score(y_true=labels.loc[notIndex],y_score=y_prob_list)\n", " AccScore_NOTlist = accuracy_score(y_true=labels.loc[notIndex],y_pred=yprob_pred_list)\n", " df.loc[notIndex,'PredictProbNOTList'] = y_prob_list\n", " df.loc[notIndex,'PredictPredNOTList'] = yprob_pred_list\n", " \n", " \n", " \n", " \n", " df.loc[labels.index,'PredictProb'] = yTest_prob\n", " df.loc[labels.index,'PredictPred'] = yTest_pred\n", " \n", " et = time.time() - st\n", " print('{:.2f} : Testing Model with Holdout Data'.format(et))\n", " return(RocAucScore,RocAucScore_list,RocAucScore_NOTlist,AccScore,AccScore_list,AccScore_NOTlist,len(listIndex),len(notIndex),df.loc[labels.index,:],X) " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def prepareTestingData(df,vocab,section):\n", " st = time.time()\n", " \n", " df = df.copy() \n", " df['DocTags'] = [[] for i in range(df.shape[0])] \n", " \n", " \n", " splitStrings = df.loc[:,section].str.lower()\n", " splitStrings = splitStrings.str.split(\"\\W\")\n", " \n", " testTokens = pd.Series(dtype=\"object\")\n", " for i in splitStrings.index:\n", " j = []\n", " for m in splitStrings.loc[i]:\n", " if m in vocab:\n", " j.append(m)\n", "\n", " testTokens.loc[i] = j\n", " df.loc[testTokens.index,'tokenizedTexts'] = testTokens\n", " \n", " \n", " et = time.time() - st\n", " print('{:.2f} : Preparing Test Holdout Data'.format(et))\n", " return(df)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# DEFINE FUNCTIONS NECESSARY FOR EVALUATION OF MODEL PERFORMANCE ON DATA SUBSETS\n", "qtypes = [\n", " 'Reference',\n", " 'Reserves',\n", " 'Technical',\n", " 'Circulation',\n", " 'Misc',\n", " 'Building',\n", " 'Directional', \n", " 'KREx', \n", " 'ResearchConsultation',\n", " 'NewPrairiePress', \n", " 'KAPI', \n", " 'Copyright', \n", " 'Unknown',\n", "]\n", "\n", "tag_labels = [\n", " \"tagURL\",\n", " \"tagPRINTING\",\n", " \"tagSCANNER\",\n", " \"tagHOURS\",\n", " \"tagLIBMATHPHYS\",\n", " \"tagLIBWEIGEL\",\n", " \"tagLIBVETMED\",\n", " \"tagLIBHALE\",\n", " \"tagLIBSTACKS\",\n", " \"tagTEXTBOOKS\",\n", " \"tagQUIET\",\n", " \"tagLIBLOCATION\",\n", " \"tagARTICLES\",\n", " \"tagEVIDENCEBASED\",\n", " \"tagJUVENILE\",\n", " \"tagCURRICULUM\",\n", " \"tagKNOWNITEMARTICLE\",\n", " \"tagKNOWNITEMBOOK\",\n", " \"tagREFERENCE\",\n", "]\n", "\n", "def ROCAUC(df, col, labelledsection): \n", " labelledsection = labelledsection\n", " df = df.copy() \n", " ytrue = df.loc[df[col]==1,labelledSection]\n", " yprob = df.loc[df[col]==1,'PredictProb'] \n", " ypred = df.loc[df[col]==1,'PredictPred']\n", " \n", " if len(ytrue.unique()) == 1:\n", " try:\n", " rocauc = None\n", " acc = accuracy_score(y_true=ytrue,y_pred=ypred) \n", " except:\n", " rocauc = None\n", " acc = None\n", " else:\n", " try:\n", " rocauc = roc_auc_score(y_true=ytrue,y_score=yprob)\n", " acc = accuracy_score(y_true=ytrue,y_pred=ypred)\n", " except:\n", " rocauc = None\n", " acc = None\n", " return(rocauc, acc)\n", "\n", "\n", "def getFocusedROCAUC(df_train,df_test,qtypes,tag_labels,labelledsection):\n", " st = time.time()\n", " \n", " qtypes = qtypes\n", " tag_labels = tag_labels\n", " labelledsection = labelledsection\n", " \n", " df_train = df_train.copy() \n", " df_test = df_test.copy() \n", " \n", " markers = pd.DataFrame(index=df_train.index, columns=(qtypes+tag_labels))\n", " markers = markers.fillna(0)\n", " df_train = pd.merge(df_train, markers, left_index=True, right_index=True, how=\"outer\")\n", " \n", " markers = pd.DataFrame(index=df_test.index, columns=(qtypes+tag_labels))\n", " markers = markers.fillna(0)\n", " df_test = pd.merge(df_test, markers, left_index=True, right_index=True, how=\"outer\")\n", " \n", " for i in qtypes:\n", " df_train.loc[df_train[\"Question Type\"]==i,i] = 1\n", " df_test.loc[df_test[\"Question Type\"]==i,i] = 1\n", " \n", " \n", " for i in df_train.index:\n", " if len(df_train.loc[i,\"manualTags\"])==0:\n", " continue\n", " else:\n", " for k in df_train.loc[i,\"manualTags\"]:\n", " df_train.loc[i,k] = 1\n", " \n", " for i in df_test.index:\n", " if len(df_test.loc[i,\"manualTags\"])==0:\n", " continue\n", " else:\n", " for k in df_test.loc[i,\"manualTags\"]:\n", " df_test.loc[i,k] = 1\n", " \n", " \n", " tr_Reference = ROCAUC(df = df_train, col = \"Reference\", labelledsection = labelledsection)\n", " tr_Reserves = ROCAUC(df = df_train, col = \"Reserves\", labelledsection = labelledsection)\n", " tr_Technical = ROCAUC(df = df_train, col = \"Technical\", labelledsection = labelledsection)\n", " tr_Circulation = ROCAUC(df = df_train, col = \"Circulation\", labelledsection = labelledsection)\n", " tr_Misc = ROCAUC(df = df_train, col = \"Misc\", labelledsection = labelledsection)\n", " tr_Building = ROCAUC(df = df_train, col = \"Building\", labelledsection = labelledsection)\n", " tr_Directional = ROCAUC(df = df_train, col = \"Directional\", labelledsection = labelledsection)\n", " tr_KREx = ROCAUC(df = df_train, col = \"KREx\", labelledsection = labelledsection)\n", " tr_ResearchConsultation = ROCAUC(df = df_train, col = \"ResearchConsultation\", labelledsection = labelledsection)\n", " tr_NewPrairiePress = ROCAUC(df = df_train, col = \"NewPrairiePress\", labelledsection = labelledsection)\n", " tr_KAPI = ROCAUC(df = df_train, col = \"KAPI\", labelledsection = labelledsection)\n", " tr_Copyright = ROCAUC(df = df_train, col = \"Copyright\", labelledsection = labelledsection)\n", " tr_Unknown = ROCAUC(df = df_train, col = \"Unknown\", labelledsection = labelledsection)\n", " tr_tagURL = ROCAUC(df = df_train, col = \"tagURL\", labelledsection = labelledsection)\n", " tr_tagPRINTING = ROCAUC(df = df_train, col = \"tagPRINTING\", labelledsection = labelledsection)\n", " tr_tagSCANNER = ROCAUC(df = df_train, col = \"tagSCANNER\", labelledsection = labelledsection)\n", " tr_tagHOURS = ROCAUC(df = df_train, col = \"tagHOURS\", labelledsection = labelledsection)\n", " tr_tagLIBMATHPHYS = ROCAUC(df = df_train, col = \"tagLIBMATHPHYS\", labelledsection = labelledsection)\n", " tr_tagLIBWEIGEL = ROCAUC(df = df_train, col = \"tagLIBWEIGEL\", labelledsection = labelledsection)\n", " tr_tagLIBVETMED = ROCAUC(df = df_train, col = \"tagLIBVETMED\", labelledsection = labelledsection)\n", " tr_tagLIBHALE = ROCAUC(df = df_train, col = \"tagLIBHALE\", labelledsection = labelledsection)\n", " tr_tagLIBSTACKS = ROCAUC(df = df_train, col = \"tagLIBSTACKS\", labelledsection = labelledsection)\n", " tr_tagTEXTBOOKS = ROCAUC(df = df_train, col = \"tagTEXTBOOKS\", labelledsection = labelledsection)\n", " tr_tagQUIET = ROCAUC(df = df_train, col = \"tagQUIET\", labelledsection = labelledsection)\n", " tr_tagLIBLOCATION = ROCAUC(df = df_train, col = \"tagLIBLOCATION\", labelledsection = labelledsection)\n", " tr_tagARTICLES = ROCAUC(df = df_train, col = \"tagARTICLES\", labelledsection = labelledsection)\n", " tr_tagEVIDENCEBASED = ROCAUC(df = df_train, col = \"tagEVIDENCEBASED\", labelledsection = labelledsection)\n", " tr_tagJUVENILE = ROCAUC(df = df_train, col = \"tagJUVENILE\", labelledsection = labelledsection)\n", " tr_tagCURRICULUM = ROCAUC(df = df_train, col = \"tagCURRICULUM\", labelledsection = labelledsection)\n", " tr_tagKNOWNITEMARTICLE = ROCAUC(df = df_train, col = \"tagKNOWNITEMARTICLE\", labelledsection = labelledsection)\n", " tr_tagKNOWNITEMBOOK = ROCAUC(df = df_train, col = \"tagKNOWNITEMBOOK\", labelledsection = labelledsection)\n", " tr_tagREFERENCE = ROCAUC(df = df_train, col = \"tagREFERENCE\", labelledsection = labelledsection)\n", "\n", " ts_Reference = ROCAUC(df = df_test, col = \"Reference\", labelledsection = labelledsection)\n", " ts_Reserves = ROCAUC(df = df_test, col = \"Reserves\", labelledsection = labelledsection)\n", " ts_Technical = ROCAUC(df = df_test, col = \"Technical\", labelledsection = labelledsection)\n", " ts_Circulation = ROCAUC(df = df_test, col = \"Circulation\", labelledsection = labelledsection)\n", " ts_Misc = ROCAUC(df = df_test, col = \"Misc\", labelledsection = labelledsection)\n", " ts_Building = ROCAUC(df = df_test, col = \"Building\", labelledsection = labelledsection)\n", " ts_Directional = ROCAUC(df = df_test, col = \"Directional\", labelledsection = labelledsection)\n", " ts_KREx = ROCAUC(df = df_test, col = \"KREx\", labelledsection = labelledsection)\n", " ts_ResearchConsultation = ROCAUC(df = df_test, col = \"ResearchConsultation\", labelledsection = labelledsection)\n", " ts_NewPrairiePress = ROCAUC(df = df_test, col = \"NewPrairiePress\", labelledsection = labelledsection)\n", " ts_KAPI = ROCAUC(df = df_test, col = \"KAPI\", labelledsection = labelledsection)\n", " ts_Copyright = ROCAUC(df = df_test, col = \"Copyright\", labelledsection = labelledsection)\n", " ts_Unknown = ROCAUC(df = df_test, col = \"Unknown\", labelledsection = labelledsection)\n", " ts_tagURL = ROCAUC(df = df_test, col = \"tagURL\", labelledsection = labelledsection)\n", " ts_tagPRINTING = ROCAUC(df = df_test, col = \"tagPRINTING\", labelledsection = labelledsection)\n", " ts_tagSCANNER = ROCAUC(df = df_test, col = \"tagSCANNER\", labelledsection = labelledsection)\n", " ts_tagHOURS = ROCAUC(df = df_test, col = \"tagHOURS\", labelledsection = labelledsection)\n", " ts_tagLIBMATHPHYS = ROCAUC(df = df_test, col = \"tagLIBMATHPHYS\", labelledsection = labelledsection)\n", " ts_tagLIBWEIGEL = ROCAUC(df = df_test, col = \"tagLIBWEIGEL\", labelledsection = labelledsection)\n", " ts_tagLIBVETMED = ROCAUC(df = df_test, col = \"tagLIBVETMED\", labelledsection = labelledsection)\n", " ts_tagLIBHALE = ROCAUC(df = df_test, col = \"tagLIBHALE\", labelledsection = labelledsection)\n", " ts_tagLIBSTACKS = ROCAUC(df = df_test, col = \"tagLIBSTACKS\", labelledsection = labelledsection)\n", " ts_tagTEXTBOOKS = ROCAUC(df = df_test, col = \"tagTEXTBOOKS\", labelledsection = labelledsection)\n", " ts_tagQUIET = ROCAUC(df = df_test, col = \"tagQUIET\", labelledsection = labelledsection)\n", " ts_tagLIBLOCATION = ROCAUC(df = df_test, col = \"tagLIBLOCATION\", labelledsection = labelledsection)\n", " ts_tagARTICLES = ROCAUC(df = df_test, col = \"tagARTICLES\", labelledsection = labelledsection)\n", " ts_tagEVIDENCEBASED = ROCAUC(df = df_test, col = \"tagEVIDENCEBASED\", labelledsection = labelledsection)\n", " ts_tagJUVENILE = ROCAUC(df = df_test, col = \"tagJUVENILE\", labelledsection = labelledsection)\n", " ts_tagCURRICULUM = ROCAUC(df = df_test, col = \"tagCURRICULUM\", labelledsection = labelledsection)\n", " ts_tagKNOWNITEMARTICLE = ROCAUC(df = df_test, col = \"tagKNOWNITEMARTICLE\", labelledsection = labelledsection)\n", " ts_tagKNOWNITEMBOOK = ROCAUC(df = df_test, col = \"tagKNOWNITEMBOOK\", labelledsection = labelledsection)\n", " ts_tagREFERENCE = ROCAUC(df = df_test, col = \"tagREFERENCE\", labelledsection = labelledsection) \n", " \n", " \n", " et = time.time() - st\n", " print('{:.2f} : Getting Focused ROC-AUC Scores'.format(et))\n", " return(df_train, df_test, ts_tagREFERENCE, ts_tagKNOWNITEMBOOK, ts_tagKNOWNITEMARTICLE, ts_tagCURRICULUM, ts_tagJUVENILE, ts_tagEVIDENCEBASED, ts_tagARTICLES, ts_tagLIBLOCATION, ts_tagQUIET, ts_tagTEXTBOOKS, ts_tagLIBSTACKS, ts_tagLIBHALE, ts_tagLIBVETMED, ts_tagLIBWEIGEL, ts_tagLIBMATHPHYS, ts_tagHOURS, ts_tagSCANNER, ts_tagPRINTING, ts_tagURL, ts_Unknown, ts_Copyright, ts_KAPI, ts_NewPrairiePress, ts_ResearchConsultation, ts_KREx, ts_Directional, ts_Building, ts_Misc, ts_Circulation, ts_Technical, ts_Reserves, ts_Reference, tr_tagREFERENCE, tr_tagKNOWNITEMBOOK, tr_tagKNOWNITEMARTICLE, tr_tagCURRICULUM, tr_tagJUVENILE, tr_tagEVIDENCEBASED, tr_tagARTICLES, tr_tagLIBLOCATION, tr_tagQUIET, tr_tagTEXTBOOKS, tr_tagLIBSTACKS, tr_tagLIBHALE, tr_tagLIBVETMED, tr_tagLIBWEIGEL, tr_tagLIBMATHPHYS, tr_tagHOURS, tr_tagSCANNER, tr_tagPRINTING, tr_tagURL, tr_Unknown, tr_Copyright, tr_KAPI, tr_NewPrairiePress, tr_ResearchConsultation, tr_KREx, tr_Directional, tr_Building, tr_Misc, tr_Circulation, tr_Technical, tr_Reserves, tr_Reference)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# ESTABLISH EXPERIMENTAL DESIGN STRUCTURE AND SPECIFYING MODELLING PARAMETERS\n", "\n", "\n", "\n", "# RAND\n", "randomOptions = list(np.arange(0,20,1))\n", "\n", "# SPLIT\n", "testingSplits = [\n", " 2000,\n", "]\n", "\n", "# TAG\n", "manualTagOptions = [\n", " True,\n", " False,\n", "]\n", "\n", "# DICT\n", "dictOptions = [ \n", " (False, 1, 300000),\n", " (True, 2, 3000),\n", "]\n", "\n", "# TRUNC\n", "patronSectionOptions = [\n", " 'TRUNC_10',\n", " 'TRUNC_20',\n", "]\n", "\n", "# READ\n", "labelledSectionOptions = [\n", " 'READ_1_vs_2',\n", " 'READ_2_vs_3',\n", "]\n", "\n", "# D2V\n", "modelOptions = [\n", " 'D2V_75',\n", " 'D2V_150',\n", "]\n", "\n", "\n", "optionsList = [\n", " randomOptions,\n", " manualTagOptions,\n", " dictOptions,\n", " patronSectionOptions,\n", " labelledSectionOptions,\n", " modelOptions,\n", "]\n", "\n", "optionsLen = [len(x) for x in optionsList]\n", "\n", "testtotal = np.prod(optionsLen)\n", "iterationcounter = 0\n", "\n", "parameterList = []" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "# EXECUTE STEP-WISE CONSTRUCTION AND EVALUATION OF EVERY MODEL\n", "for z in tqdm.tqdm(randomOptions):\n", " randomSeed=z\n", "\n", " for s in testingSplits:\n", " split = s\n", " testsplit = rawData.shape[0]-split\n", " trainData = rawData[:testsplit]\n", " testData = rawData[testsplit:] \n", " trainShape = trainData.shape[0]\n", " testShape = testData.shape[0]\n", "\n", " for e in patronSectionOptions:\n", " patronSection=e \n", " for f in manualTagOptions:\n", " getTags = f\n", " trainDataTagged = getManualTags(df=trainData,\n", " manualTagsList=manualTags,\n", " section=patronSection,\n", " )\n", " testDataTagged = getManualTags(df=testData,\n", " manualTagsList=manualTags,\n", " section=patronSection,\n", " )\n", "\n", " for g in labelledSectionOptions:\n", " labelledSection=g\n", " \n", " for h in modelOptions:\n", " model=h \n", "\n", " for m in dictOptions:\n", " truncate=m \n", " start_time = time.time()\n", " trainDataTRUNC,countDF,tfidfDF,tfidfList,remove,vocab,gendict = getTFIDFlimited(\n", " df=trainDataTagged,\n", " truncate=truncate,\n", " section=patronSection,\n", " controlVocab=customVocab,\n", " model=model,\n", " )\n", " d2vModel, representation = doc2vecModel(\n", " df=trainDataTRUNC,\n", " vecs=h,\n", " random=randomSeed,\n", " tagState=getTags\n", " )\n", " MLPmodel,trainAUC,trainAUClist,trainAUCNOTlist,trainAccScore,trainAccScore_list,trainAccScore_NOTlist,trainLlen,trainNLlen,trainDataFinal,trainFinalRep = trainModel(\n", " df=trainDataTRUNC,\n", " reps=representation,\n", " targetLabels=labelledSection,\n", " random=randomSeed,\n", " )\n", " testDataPrepped = prepareTestingData(\n", " df=testDataTagged,\n", " vocab=vocab,\n", " section=patronSection,\n", " )\n", " testAUC,testAUClist,testAUCNOTlist,testAccScore,testAccScore_list,testAccScore_NOTlist,testLlen,testNLlen,testDataFinal,testFinalRep = testModel(\n", " df=testDataPrepped,\n", " targetLabels=labelledSection,\n", " d2vModel=d2vModel,\n", " dictionary=gendict,\n", " neural_model=MLPmodel,\n", " ) \n", " trainDataFinal, testDataFinal, ts_tagREFERENCE, ts_tagKNOWNITEMBOOK, ts_tagKNOWNITEMARTICLE, ts_tagCURRICULUM, ts_tagJUVENILE, ts_tagEVIDENCEBASED, ts_tagARTICLES, ts_tagLIBLOCATION, ts_tagQUIET, ts_tagTEXTBOOKS, ts_tagLIBSTACKS, ts_tagLIBHALE, ts_tagLIBVETMED, ts_tagLIBWEIGEL, ts_tagLIBMATHPHYS, ts_tagHOURS, ts_tagSCANNER, ts_tagPRINTING, ts_tagURL, ts_Unknown, ts_Copyright, ts_KAPI, ts_NewPrairiePress, ts_ResearchConsultation, ts_KREx, ts_Directional, ts_Building, ts_Misc, ts_Circulation, ts_Technical, ts_Reserves, ts_Reference, tr_tagREFERENCE, tr_tagKNOWNITEMBOOK, tr_tagKNOWNITEMARTICLE, tr_tagCURRICULUM, tr_tagJUVENILE, tr_tagEVIDENCEBASED, tr_tagARTICLES, tr_tagLIBLOCATION, tr_tagQUIET, tr_tagTEXTBOOKS, tr_tagLIBSTACKS, tr_tagLIBHALE, tr_tagLIBVETMED, tr_tagLIBWEIGEL, tr_tagLIBMATHPHYS, tr_tagHOURS, tr_tagSCANNER, tr_tagPRINTING, tr_tagURL, tr_Unknown, tr_Copyright, tr_KAPI, tr_NewPrairiePress, tr_ResearchConsultation, tr_KREx, tr_Directional, tr_Building, tr_Misc, tr_Circulation, tr_Technical, tr_Reserves, tr_Reference = getFocusedROCAUC(\n", " df_train = trainDataFinal,\n", " df_test = testDataFinal,\n", " qtypes = qtypes,\n", " tag_labels = tag_labels,\n", " labelledsection = labelledSection,\n", " )\n", " end_time = time.time()\n", " total_time = end_time-start_time\n", " modelParameters = {\n", " 'TRUNC':patronSection,\n", " 'TAG':getTags,\n", " 'DICT':str(truncate),\n", " 'D2V':model,\n", " 'READ':labelledSection,\n", " \n", " 'AUC_train':trainAUC,\n", " 'AUC_test':testAUC,\n", "\n", " 'AUC_train_TAG':trainAUClist,\n", " 'AUC_test_TAG':testAUClist,\n", " 'AUC_train_Not_TAG':trainAUCNOTlist,\n", " 'AUC_test_Not_TAG':testAUCNOTlist,\n", " \n", " 'ACC_train':trainAccScore,\n", " 'ACC_test':testAccScore,\n", "\n", " 'ACC_train_TAG':trainAccScore_list,\n", " 'ACC_test_TAG':trainAccScore_NOTlist,\n", " 'ACC_train_Not_TAG':testAccScore_list,\n", " 'ACC_test_Not_TAG':testAccScore_NOTlist,\n", "\n", " 'AUC_train_TAG_LEN':trainLlen,\n", " 'AUC_train_TAG_N_LEN':trainNLlen,\n", " 'AUC_test_TAG_LEN':testLlen,\n", " 'AUC_test_TAG_N_LEN':testNLlen,\n", " \n", " \"AUC_tr_Reference\": tr_Reference[0],\n", " \"AUC_tr_Reserves\": tr_Reserves[0],\n", " \"AUC_tr_Technical\": tr_Technical[0],\n", " \"AUC_tr_Circulation\": tr_Circulation[0],\n", " \"AUC_tr_Misc\": tr_Misc[0],\n", " \"AUC_tr_Building\": tr_Building[0],\n", " \"AUC_tr_Directional\": tr_Directional[0],\n", " \"AUC_tr_KREx\": tr_KREx[0],\n", " \"AUC_tr_ResearchConsultation\": tr_ResearchConsultation[0],\n", " \"AUC_tr_NewPrairiePress\": tr_NewPrairiePress[0],\n", " \"AUC_tr_KAPI\": tr_KAPI[0],\n", " \"AUC_tr_Copyright\": tr_Copyright[0],\n", " \"AUC_tr_Unknown\": tr_Unknown[0],\n", " \"AUC_tr_tagURL\": tr_tagURL[0],\n", " \"AUC_tr_tagPRINTING\": tr_tagPRINTING[0],\n", " \"AUC_tr_tagSCANNER\": tr_tagSCANNER[0],\n", " \"AUC_tr_tagHOURS\": tr_tagHOURS[0],\n", " \"AUC_tr_tagLIBMATHPHYS\": tr_tagLIBMATHPHYS[0],\n", " \"AUC_tr_tagLIBWEIGEL\": tr_tagLIBWEIGEL[0],\n", " \"AUC_tr_tagLIBVETMED\": tr_tagLIBVETMED[0],\n", " \"AUC_tr_tagLIBHALE\": tr_tagLIBHALE[0],\n", " \"AUC_tr_tagLIBSTACKS\": tr_tagLIBSTACKS[0],\n", " \"AUC_tr_tagTEXTBOOKS\": tr_tagTEXTBOOKS[0],\n", " \"AUC_tr_tagQUIET\": tr_tagQUIET[0],\n", " \"AUC_tr_tagLIBLOCATION\": tr_tagLIBLOCATION[0],\n", " \"AUC_tr_tagARTICLES\": tr_tagARTICLES[0],\n", " \"AUC_tr_tagEVIDENCEBASED\": tr_tagEVIDENCEBASED[0],\n", " \"AUC_tr_tagJUVENILE\": tr_tagJUVENILE[0],\n", " \"AUC_tr_tagCURRICULUM\": tr_tagCURRICULUM[0],\n", " \"AUC_tr_tagKNOWNITEMARTICLE\": tr_tagKNOWNITEMARTICLE[0],\n", " \"AUC_tr_tagKNOWNITEMBOOK\": tr_tagKNOWNITEMBOOK[0],\n", " \"AUC_tr_tagREFERENCE\": tr_tagREFERENCE[0],\n", " \"AUC_ts_Reference\": ts_Reference[0],\n", " \"AUC_ts_Reserves\": ts_Reserves[0],\n", " \"AUC_ts_Technical\": ts_Technical[0],\n", " \"AUC_ts_Circulation\": ts_Circulation[0],\n", " \"AUC_ts_Misc\": ts_Misc[0],\n", " \"AUC_ts_Building\": ts_Building[0],\n", " \"AUC_ts_Directional\": ts_Directional[0],\n", " \"AUC_ts_KREx\": ts_KREx[0],\n", " \"AUC_ts_ResearchConsultation\": ts_ResearchConsultation[0],\n", " \"AUC_ts_NewPrairiePress\": ts_NewPrairiePress[0],\n", " \"AUC_ts_KAPI\": ts_KAPI[0],\n", " \"AUC_ts_Copyright\": ts_Copyright[0],\n", " \"AUC_ts_Unknown\": ts_Unknown[0],\n", " \"AUC_ts_tagURL\": ts_tagURL[0],\n", " \"AUC_ts_tagPRINTING\": ts_tagPRINTING[0],\n", " \"AUC_ts_tagSCANNER\": ts_tagSCANNER[0],\n", " \"AUC_ts_tagHOURS\": ts_tagHOURS[0],\n", " \"AUC_ts_tagLIBMATHPHYS\": ts_tagLIBMATHPHYS[0],\n", " \"AUC_ts_tagLIBWEIGEL\": ts_tagLIBWEIGEL[0],\n", " \"AUC_ts_tagLIBVETMED\": ts_tagLIBVETMED[0],\n", " \"AUC_ts_tagLIBHALE\": ts_tagLIBHALE[0],\n", " \"AUC_ts_tagLIBSTACKS\": ts_tagLIBSTACKS[0],\n", " \"AUC_ts_tagTEXTBOOKS\": ts_tagTEXTBOOKS[0],\n", " \"AUC_ts_tagQUIET\": ts_tagQUIET[0],\n", " \"AUC_ts_tagLIBLOCATION\": ts_tagLIBLOCATION[0],\n", " \"AUC_ts_tagARTICLES\": ts_tagARTICLES[0],\n", " \"AUC_ts_tagEVIDENCEBASED\": ts_tagEVIDENCEBASED[0],\n", " \"AUC_ts_tagJUVENILE\": ts_tagJUVENILE[0],\n", " \"AUC_ts_tagCURRICULUM\": ts_tagCURRICULUM[0],\n", " \"AUC_ts_tagKNOWNITEMARTICLE\": ts_tagKNOWNITEMARTICLE[0],\n", " \"AUC_ts_tagKNOWNITEMBOOK\": ts_tagKNOWNITEMBOOK[0],\n", " \"AUC_ts_tagREFERENCE\": ts_tagREFERENCE[0], \n", " \n", " \"ACC_tr_Reference\": tr_Reference[1],\n", " \"ACC_tr_Reserves\": tr_Reserves[1],\n", " \"ACC_tr_Technical\": tr_Technical[1],\n", " \"ACC_tr_Circulation\": tr_Circulation[1],\n", " \"ACC_tr_Misc\": tr_Misc[1],\n", " \"ACC_tr_Building\": tr_Building[1],\n", " \"ACC_tr_Directional\": tr_Directional[1],\n", " \"ACC_tr_KREx\": tr_KREx[1],\n", " \"ACC_tr_ResearchConsultation\": tr_ResearchConsultation[1],\n", " \"ACC_tr_NewPrairiePress\": tr_NewPrairiePress[1],\n", " \"ACC_tr_KAPI\": tr_KAPI[1],\n", " \"ACC_tr_Copyright\": tr_Copyright[1],\n", " \"ACC_tr_Unknown\": tr_Unknown[1],\n", " \"ACC_tr_tagURL\": tr_tagURL[1],\n", " \"ACC_tr_tagPRINTING\": tr_tagPRINTING[1],\n", " \"ACC_tr_tagSCANNER\": tr_tagSCANNER[1],\n", " \"ACC_tr_tagHOURS\": tr_tagHOURS[1],\n", " \"ACC_tr_tagLIBMATHPHYS\": tr_tagLIBMATHPHYS[1],\n", " \"ACC_tr_tagLIBWEIGEL\": tr_tagLIBWEIGEL[1],\n", " \"ACC_tr_tagLIBVETMED\": tr_tagLIBVETMED[1],\n", " \"ACC_tr_tagLIBHALE\": tr_tagLIBHALE[1],\n", " \"ACC_tr_tagLIBSTACKS\": tr_tagLIBSTACKS[1],\n", " \"ACC_tr_tagTEXTBOOKS\": tr_tagTEXTBOOKS[1],\n", " \"ACC_tr_tagQUIET\": tr_tagQUIET[1],\n", " \"ACC_tr_tagLIBLOCATION\": tr_tagLIBLOCATION[1],\n", " \"ACC_tr_tagARTICLES\": tr_tagARTICLES[1],\n", " \"ACC_tr_tagEVIDENCEBASED\": tr_tagEVIDENCEBASED[1],\n", " \"ACC_tr_tagJUVENILE\": tr_tagJUVENILE[1],\n", " \"ACC_tr_tagCURRICULUM\": tr_tagCURRICULUM[1],\n", " \"ACC_tr_tagKNOWNITEMARTICLE\": tr_tagKNOWNITEMARTICLE[1],\n", " \"ACC_tr_tagKNOWNITEMBOOK\": tr_tagKNOWNITEMBOOK[1],\n", " \"ACC_tr_tagREFERENCE\": tr_tagREFERENCE[1],\n", " \"ACC_ts_Reference\": ts_Reference[1],\n", " \"ACC_ts_Reserves\": ts_Reserves[1],\n", " \"ACC_ts_Technical\": ts_Technical[1],\n", " \"ACC_ts_Circulation\": ts_Circulation[1],\n", " \"ACC_ts_Misc\": ts_Misc[1],\n", " \"ACC_ts_Building\": ts_Building[1],\n", " \"ACC_ts_Directional\": ts_Directional[1],\n", " \"ACC_ts_KREx\": ts_KREx[1],\n", " \"ACC_ts_ResearchConsultation\": ts_ResearchConsultation[1],\n", " \"ACC_ts_NewPrairiePress\": ts_NewPrairiePress[1],\n", " \"ACC_ts_KAPI\": ts_KAPI[1],\n", " \"ACC_ts_Copyright\": ts_Copyright[1],\n", " \"ACC_ts_Unknown\": ts_Unknown[1],\n", " \"ACC_ts_tagURL\": ts_tagURL[1],\n", " \"ACC_ts_tagPRINTING\": ts_tagPRINTING[1],\n", " \"ACC_ts_tagSCANNER\": ts_tagSCANNER[1],\n", " \"ACC_ts_tagHOURS\": ts_tagHOURS[1],\n", " \"ACC_ts_tagLIBMATHPHYS\": ts_tagLIBMATHPHYS[1],\n", " \"ACC_ts_tagLIBWEIGEL\": ts_tagLIBWEIGEL[1],\n", " \"ACC_ts_tagLIBVETMED\": ts_tagLIBVETMED[1],\n", " \"ACC_ts_tagLIBHALE\": ts_tagLIBHALE[1],\n", " \"ACC_ts_tagLIBSTACKS\": ts_tagLIBSTACKS[1],\n", " \"ACC_ts_tagTEXTBOOKS\": ts_tagTEXTBOOKS[1],\n", " \"ACC_ts_tagQUIET\": ts_tagQUIET[1],\n", " \"ACC_ts_tagLIBLOCATION\": ts_tagLIBLOCATION[1],\n", " \"ACC_ts_tagARTICLES\": ts_tagARTICLES[1],\n", " \"ACC_ts_tagEVIDENCEBASED\": ts_tagEVIDENCEBASED[1],\n", " \"ACC_ts_tagJUVENILE\": ts_tagJUVENILE[1],\n", " \"ACC_ts_tagCURRICULUM\": ts_tagCURRICULUM[1],\n", " \"ACC_ts_tagKNOWNITEMARTICLE\": ts_tagKNOWNITEMARTICLE[1],\n", " \"ACC_ts_tagKNOWNITEMBOOK\": ts_tagKNOWNITEMBOOK[1],\n", " \"ACC_ts_tagREFERENCE\": ts_tagREFERENCE[1], \n", " \n", " 'CycleTime':total_time,\n", " 'RAND':randomSeed,\n", " 'Custom Vocab Len':len(customVocab),\n", " 'Full Vocab Len':len(vocab),\n", " 'Testing Split':testsplit,\n", " 'Training Data Full':trainShape,\n", " 'Testing Data Full':testShape,\n", " 'Training Data Final':trainDataFinal.shape[0],\n", " 'Testing Data Final':testDataFinal.shape[0],\n", " }\n", "\n", " parameterList.append(list(modelParameters.values()))\n", " print(modelParameters.values())\n", " iterationcounter += 1\n", " print(\"{} out of {} complete\".format(iterationcounter,testtotal))\n", "\n", "parameterDataFrame = pd.DataFrame(parameterList,columns=list(modelParameters.keys()))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# SAVE AND STORE RESULTS LOCALLY\n", "parameterDataFrame.to_csv('dataruns/{}_Run.csv'.format(today))\n", "trainDataFinal.to_csv('dataruns/{}_trainDataFinal.csv'.format(today))\n", "testDataFinal.to_csv('dataruns/{}_testDataFinal.csv'.format(today))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# REOPEN DATA IN NEW DATAFRAME OBJECTS\n", "timestamp = today\n", "\n", "preservedDataFrame = pd.read_csv('dataruns/{}_preserveRun.csv'.format(timestamp),index_col=0)\n", "trainDataFinal = pd.read_csv('dataruns/{}_trainDataFinal.csv'.format(timestamp),index_col=0)\n", "testDataFinal = pd.read_csv('dataruns/{}_testDataFinal.csv'.format(timestamp),index_col=0)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# WRANGLE DATA INTO CONVENIENT FORMS AND PRODUCE DESCRIPTIVE STATISTICS\n", "\n", "newc = [\n", " 'AUC_train',\n", " 'AUC_test',\n", " \n", " 'AUC_train_TAG',\n", " 'AUC_train_Not_TAG', \n", " \n", " 'AUC_test_TAG', \n", " 'AUC_test_Not_TAG', \n", " \n", " 'AUC_tr_Reference',\n", " 'AUC_tr_Reserves',\n", " 'AUC_tr_Technical',\n", " 'AUC_tr_Circulation',\n", " 'AUC_tr_Misc',\n", " 'AUC_tr_Building',\n", " 'AUC_tr_Directional',\n", " 'AUC_tr_KREx',\n", " 'AUC_tr_ResearchConsultation', \n", " 'AUC_tr_NewPrairiePress', \n", " 'AUC_tr_KAPI',\n", " 'AUC_tr_Copyright', \n", " 'AUC_tr_Unknown',\n", " \n", " 'AUC_tr_tagURL',\n", " 'AUC_tr_tagPRINTING', \n", " 'AUC_tr_tagSCANNER', \n", " 'AUC_tr_tagHOURS',\n", " 'AUC_tr_tagLIBMATHPHYS',\n", " 'AUC_tr_tagLIBWEIGEL',\n", " 'AUC_tr_tagLIBVETMED',\n", " 'AUC_tr_tagLIBHALE', \n", " 'AUC_tr_tagLIBSTACKS',\n", " 'AUC_tr_tagTEXTBOOKS',\n", " 'AUC_tr_tagQUIET', \n", " 'AUC_tr_tagLIBLOCATION',\n", " 'AUC_tr_tagARTICLES',\n", " 'AUC_tr_tagEVIDENCEBASED',\n", " 'AUC_tr_tagJUVENILE', \n", " 'AUC_tr_tagCURRICULUM',\n", " 'AUC_tr_tagKNOWNITEMARTICLE', \n", " 'AUC_tr_tagKNOWNITEMBOOK',\n", " 'AUC_tr_tagREFERENCE',\n", " \n", " 'AUC_ts_Reference',\n", " 'AUC_ts_Reserves',\n", " 'AUC_ts_Technical',\n", " 'AUC_ts_Circulation', \n", " 'AUC_ts_Misc',\n", " 'AUC_ts_Building',\n", " 'AUC_ts_Directional', \n", " 'AUC_ts_KREx',\n", " 'AUC_ts_ResearchConsultation',\n", " 'AUC_ts_NewPrairiePress',\n", " 'AUC_ts_KAPI',\n", " 'AUC_ts_Copyright',\n", " 'AUC_ts_Unknown',\n", " \n", " 'AUC_ts_tagURL',\n", " 'AUC_ts_tagPRINTING',\n", " 'AUC_ts_tagSCANNER',\n", " 'AUC_ts_tagHOURS',\n", " 'AUC_ts_tagLIBMATHPHYS',\n", " 'AUC_ts_tagLIBWEIGEL', \n", " 'AUC_ts_tagLIBVETMED',\n", " 'AUC_ts_tagLIBHALE',\n", " 'AUC_ts_tagLIBSTACKS', \n", " 'AUC_ts_tagTEXTBOOKS',\n", " 'AUC_ts_tagQUIET', \n", " 'AUC_ts_tagLIBLOCATION',\n", " 'AUC_ts_tagARTICLES',\n", " 'AUC_ts_tagEVIDENCEBASED',\n", " 'AUC_ts_tagJUVENILE',\n", " 'AUC_ts_tagCURRICULUM',\n", " 'AUC_ts_tagKNOWNITEMARTICLE',\n", " 'AUC_ts_tagKNOWNITEMBOOK',\n", " 'AUC_ts_tagREFERENCE',\n", " 'ACC_train',\n", " 'ACC_test',\n", " \n", " 'ACC_train_TAG',\n", " 'ACC_train_Not_TAG', \n", " \n", " 'ACC_test_TAG', \n", " 'ACC_test_Not_TAG', \n", " \n", " 'ACC_tr_Reference',\n", " 'ACC_tr_Reserves',\n", " 'ACC_tr_Technical',\n", " 'ACC_tr_Circulation',\n", " 'ACC_tr_Misc',\n", " 'ACC_tr_Building',\n", " 'ACC_tr_Directional',\n", " 'ACC_tr_KREx',\n", " 'ACC_tr_ResearchConsultation', \n", " 'ACC_tr_NewPrairiePress', \n", " 'ACC_tr_KAPI',\n", " 'ACC_tr_Copyright', \n", " 'ACC_tr_Unknown',\n", " \n", " 'ACC_tr_tagURL',\n", " 'ACC_tr_tagPRINTING', \n", " 'ACC_tr_tagSCANNER', \n", " 'ACC_tr_tagHOURS',\n", " 'ACC_tr_tagLIBMATHPHYS',\n", " 'ACC_tr_tagLIBWEIGEL',\n", " 'ACC_tr_tagLIBVETMED',\n", " 'ACC_tr_tagLIBHALE', \n", " 'ACC_tr_tagLIBSTACKS',\n", " 'ACC_tr_tagTEXTBOOKS',\n", " 'ACC_tr_tagQUIET', \n", " 'ACC_tr_tagLIBLOCATION',\n", " 'ACC_tr_tagARTICLES',\n", " 'ACC_tr_tagEVIDENCEBASED',\n", " 'ACC_tr_tagJUVENILE', \n", " 'ACC_tr_tagCURRICULUM',\n", " 'ACC_tr_tagKNOWNITEMARTICLE', \n", " 'ACC_tr_tagKNOWNITEMBOOK',\n", " 'ACC_tr_tagREFERENCE',\n", " \n", " 'ACC_ts_Reference',\n", " 'ACC_ts_Reserves',\n", " 'ACC_ts_Technical',\n", " 'ACC_ts_Circulation', \n", " 'ACC_ts_Misc',\n", " 'ACC_ts_Building',\n", " 'ACC_ts_Directional', \n", " 'ACC_ts_KREx',\n", " 'ACC_ts_ResearchConsultation',\n", " 'ACC_ts_NewPrairiePress',\n", " 'ACC_ts_KAPI',\n", " 'ACC_ts_Copyright',\n", " 'ACC_ts_Unknown',\n", " \n", " 'ACC_ts_tagURL',\n", " 'ACC_ts_tagPRINTING',\n", " 'ACC_ts_tagSCANNER',\n", " 'ACC_ts_tagHOURS',\n", " 'ACC_ts_tagLIBMATHPHYS',\n", " 'ACC_ts_tagLIBWEIGEL', \n", " 'ACC_ts_tagLIBVETMED',\n", " 'ACC_ts_tagLIBHALE',\n", " 'ACC_ts_tagLIBSTACKS', \n", " 'ACC_ts_tagTEXTBOOKS',\n", " 'ACC_ts_tagQUIET', \n", " 'ACC_ts_tagLIBLOCATION',\n", " 'ACC_ts_tagARTICLES',\n", " 'ACC_ts_tagEVIDENCEBASED',\n", " 'ACC_ts_tagJUVENILE',\n", " 'ACC_ts_tagCURRICULUM',\n", " 'ACC_ts_tagKNOWNITEMARTICLE',\n", " 'ACC_ts_tagKNOWNITEMBOOK',\n", " 'ACC_ts_tagREFERENCE',\n", " ]\n", "\n", "statDF = pd.DataFrame(index=newc)\n", "\n", "statDF[\"model_count\"] = preservedDataFrame.loc[:,newc].count().copy()\n", "statDF[\"min\"] = preservedDataFrame.loc[:,newc].min().copy()\n", "statDF[\"max\"] = preservedDataFrame.loc[:,newc].max().copy()\n", "statDF[\"mean\"] = preservedDataFrame.loc[:,newc].mean().copy()\n", "statDF[\"std\"] = preservedDataFrame.loc[:,newc].std().copy()\n", "statDF[\"t-score\"] = (statDF[\"mean\"] - 0.5) / (statDF[\"std\"] / np.sqrt(statDF[\"model_count\"]))\n", "statDF[\"p-value\"] = 1 - stats.t.cdf(x=statDF[\"t-score\"], df=statDF[\"model_count\"]-1-5)\n", "\n", "catcolumns = [\n", " 'Reference',\n", " 'Reserves',\n", " 'Technical', \n", " 'Circulation',\n", " 'Misc',\n", " 'Building', \n", " 'Directional',\n", " 'KREx',\n", " 'ResearchConsultation',\n", " 'NewPrairiePress', \n", " 'KAPI',\n", " 'Copyright', \n", " 'Unknown',\n", " 'tagURL',\n", " 'tagPRINTING',\n", " 'tagSCANNER',\n", " 'tagHOURS',\n", " 'tagLIBMATHPHYS',\n", " 'tagLIBWEIGEL',\n", " 'tagLIBVETMED', \n", " 'tagLIBHALE',\n", " 'tagLIBSTACKS',\n", " 'tagTEXTBOOKS',\n", " 'tagQUIET',\n", " 'tagLIBLOCATION',\n", " 'tagARTICLES',\n", " 'tagEVIDENCEBASED',\n", " 'tagJUVENILE',\n", " 'tagCURRICULUM',\n", " 'tagKNOWNITEMARTICLE', \n", " 'tagKNOWNITEMBOOK', \n", " 'tagREFERENCE'\n", "]\n", "\n", "train_category_counts = trainDataFinal.loc[:,catcolumns].copy().sum()\n", "train_category_counts = train_category_counts.add_prefix(\"AUC_tr_\")\n", "train_category_counts = pd.DataFrame(train_category_counts, columns=[\"n\"])\n", "\n", "test_category_counts = testDataFinal.loc[:,catcolumns].copy().sum()\n", "test_category_counts = test_category_counts.add_prefix(\"AUC_ts_\")\n", "test_category_counts = pd.DataFrame(test_category_counts, columns=[\"n\"])\n", "\n", "counts = pd.concat([train_category_counts,test_category_counts])\n", "\n", "statDF = pd.merge(statDF,counts, left_index=True,right_index=True, how=\"outer\")\n", "\n", "statDF.to_csv(\"dataruns/{}_statblock.csv\".format(timestamp))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# SET DEFAULT SEABORN PLOTTING SETTINGS\n", "sns.set()\n", "\n", "# GENERATE PLOTS FOR ROC AUC PERFORMANCE\n", "# FOCUS: HIGH LEVEL SUBSETS OF SAMPLES\n", "traintest = [\n", " [ \"AUC_train\", \"Training Data\",10162,4],\n", " [ \"AUC_test\", \"Testing Data\",1753,4],\n", "\n", "]\n", "\n", "plt.figure(figsize=(15,10))\n", "\n", "for i in traintest:\n", " ax = sns.distplot(\n", " a=preservedDataFrame[i[0]],\n", " hist=False,\n", " norm_hist=True,\n", " label=\"{} ({} Samples)\".format(i[1],i[2]),\n", " kde_kws={\n", " \"shade\":True,\n", " \"lw\":i[3]\n", " }\n", " )\n", "\n", "ax.set_xlim(0,1)\n", "ax.set_xlabel(\"AUC Score\", fontsize=\"x-large\")\n", "ax.set_ylabel(\"Density\", fontsize=\"x-large\")\n", "\n", "plt.title(\n", " \"Distribution of ROC AUC Scores for full Training and Testing Datasets\", \n", " fontdict={\"fontsize\":\"x-large\"}\n", ")\n", "\n", "plt.axvline( x=0.5, ls=\"--\", c=\"gray\", label=\"0.5 Cutoff\")\n", "plt.legend(fontsize=\"large\")\n", "plt.savefig(\"plots/traintest_{}.png\".format(timestamp))\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# GENERATE PLOTS FOR ROC AUC PERFORMANCE\n", "# FOCUS: SUBSETS OF SAMPLES SEGMENTED BY SELECTED QUESTION TYPES (TRAINING DATA)\n", "qtypes = [\n", " [ \"AUC_tr_Building\", \"Building\",709,2],\n", " [ \"AUC_tr_Circulation\", \"Circulation\",473,2],\n", " [ \"AUC_tr_Directional\", \"Directional\",200,2],\n", " [ \"AUC_tr_Misc\", \"Misc\",917,3],\n", " [ \"AUC_tr_Reference\", \"Reference\",6743,4],\n", " [ \"AUC_tr_Technical\", \"Technical\",907,3],\n", "]\n", "\n", "plt.figure(figsize=(15,10))\n", "\n", "for i in qtypes:\n", " ax = sns.distplot(\n", " a=preservedDataFrame[i[0]],\n", " hist=False,\n", " norm_hist=True,\n", " label=\"{} ({} Samples)\".format(i[1],i[2]),\n", " kde_kws={\n", " \"lw\":i[3]\n", " }\n", " )\n", "\n", "ax.set_xlim(0,1)\n", "ax.set_xlabel(\"AUC Score\", fontsize=\"x-large\")\n", "ax.set_ylabel(\"Density\", fontsize=\"x-large\")\n", "\n", "plt.title(\n", " \"TRAINING DATA - Distribution of ROC AUC Scores for Select Question Types\", \n", " fontdict={\"fontsize\":\"x-large\"}\n", ")\n", "\n", "plt.axvline( x=0.5, ls=\"--\", c=\"gray\", label=\"0.5 Cutoff\")\n", "plt.legend(fontsize=\"x-large\", loc=\"upper left\", shadow=True)\n", "plt.savefig(\"plots/qtypes_training_{}.png\".format(timestamp))\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# GENERATE PLOTS FOR ROC AUC PERFORMANCE\n", "# FOCUS: SUBSETS OF SAMPLES SEGMENTED BY SELECTED QUESTION TYPES (TESTING DATA)\n", "qtypes = qtypes = [\n", " [ \"AUC_ts_Building\", \"Building\",70,2],\n", " [ \"AUC_ts_Circulation\", \"Circulation\",88,2],\n", " [ \"AUC_ts_Directional\", \"Directional\",35,2],\n", " [ \"AUC_ts_Misc\", \"Misc\",172,3],\n", " [ \"AUC_ts_Reference\", \"Reference\",1167,4],\n", " [ \"AUC_ts_Technical\", \"Technical\",179,3],\n", "]\n", "\n", "plt.figure(figsize=(15,10))\n", "\n", "for i in qtypes:\n", " ax = sns.distplot(\n", " a=preservedDataFrame[i[0]],\n", " hist=False,\n", " norm_hist=True,\n", " label=\"{} ({} Samples)\".format(i[1],i[2]),\n", " kde_kws={\n", " \"lw\":i[3]\n", " }\n", " )\n", "\n", "ax.set_xlim(0,1)\n", "ax.set_xlabel(\"AUC Score\", fontsize=\"x-large\")\n", "ax.set_ylabel(\"Density\", fontsize=\"x-large\")\n", "\n", "plt.title(\n", " \"TESTING DATA - Distribution of ROC AUC Scores for Select Question Types\", \n", " fontdict={\"fontsize\":\"x-large\"}\n", ")\n", "\n", "plt.axvline( x=0.5, ls=\"--\", c=\"gray\", label=\"0.5 Cutoff\")\n", "plt.legend(fontsize=\"x-large\", loc=\"upper left\", shadow=True)\n", "plt.savefig(\"plots/qtypes_testing_{}.png\".format(timestamp))\n", "plt.show()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" } }, "nbformat": 4, "nbformat_minor": 4 }