Lab SD-TSIA214 - Hidden Markov Models
Riccardo Prestigiacomo - Francesco Manca
import numpy as np
""" read emails and store them in a list"""
df = []
for i in range(1,31):
data = np.loadtxt("dat/mail" + str(i) + ".dat", dtype=int)
df.append(data)
""" DATA STRUCTURE INIZIALIZATION """
states = np.array([0,1])
start_prob = np.array([1-1e-100,1e-100])
trans = np.array([[0.999218078035812,0.000781921964187974],[1e-100,1-1e-100]])
emis = np.loadtxt("PerlScriptAndModel/P.text")
def viterbiAlgorithm(obs, states, start_prob, trans, emission_prob, log=False):
"""
Viterbi Algorithm Implementation
Arguments:
- obs: sequence of observation
- states:list of states
- start_prob:vector of the initial probabilities
- trans: transition matrix
- emission_prob: emission probability matrix
Returns:
- seq: sequence of state
"""
start_prob = np.log(start_prob)
trans = np.log(trans)
emission_prob = np.log(emission_prob)
T1 = np.zeros((obs.size, states.size))
T2 = np.zeros((obs.size, states.size), dtype = np.uint8)
for state in states:
T1[0,state] = start_prob[state] + emission_prob[obs[0]][state]
T2[0,state] = 0
for index in range(1,obs.size):
for state in states:
liste = [T1[index-1,start_state] + trans[start_state, state] for start_state in states]
T1[index, state] = np.max(liste) + emission_prob[obs[index]][state]
T2[index, state] = np.argmax(liste)
if(log == False): continue
path = np.zeros(len(obs), dtype= np.uint8)
path[-1] = np.argmax(T1[-1])
for i in range(len(obs)-2,-1,-1):
path[i] = T2[i, path[i+1]]
return path + 1
" test on random email"
obs = df[10] # select mail11
x = viterbiAlgorithm(obs, states, start_prob, trans, emis, log=True)
print(x)
[1 1 1 ... 2 2 2]
"""x = viterbi(obs, states, start_prob, A, emis)for i in range(10,30):
Y = df.loc[i, ::] # select email
Y = Y[~np.isnan(Y)]
result, path = viterbiAlgorithm(Y, A, B, pi)
f = open("dat/"+'path'+str(i+1)+'.txt','w')
for p in result:
f.write(str(p))
"""
"""
Since for the test part of our data we have the precise point of transition, we're gonna compare
those values with the results obtained using our algorithm.
"""
for i in range(1,11):
print("---------------------Mail "+ str(i) + "---------------------")
Y = df[i-1]
result = viterbiAlgorithm(Y, states, start_prob, trans, emis, log = True)
header = "dat/mail" + str(i) + "h.txt"
with open(header) as h:
expected_trans = len(h.read())+1
trans_viterbi = ((np.array(result) == 1).sum()+1)
print("Expected transition between header and body: " + str(expected_trans))
print("Viterbi transition between header and body: " + str(trans_viterbi))
---------------------Mail 1---------------------
Expected transition between header and body: 3612
Viterbi transition between header and body: 3798
---------------------Mail 2---------------------
Expected transition between header and body: 2477
Viterbi transition between header and body: 2447
---------------------Mail 3---------------------
Expected transition between header and body: 2183
Viterbi transition between header and body: 2265
---------------------Mail 4---------------------
Expected transition between header and body: 2297
Viterbi transition between header and body: 2306
---------------------Mail 5---------------------
Expected transition between header and body: 2334
Viterbi transition between header and body: 2304
---------------------Mail 6---------------------
Expected transition between header and body: 2468
Viterbi transition between header and body: 2438
---------------------Mail 7---------------------
Expected transition between header and body: 2522
Viterbi transition between header and body: 2557
---------------------Mail 8---------------------
Expected transition between header and body: 2337
Viterbi transition between header and body: 2307
---------------------Mail 9---------------------
Expected transition between header and body: 2510
Viterbi transition between header and body: 2542
---------------------Mail 10---------------------
Expected transition between header and body: 2848
Viterbi transition between header and body: 2848
""" performance on test set """
def spliceText(path, text):
'''splice the text according to the states path'''
vals, index = np.unique(path, return_index=True)
index = index[1]
return text[0:index], text[index:]
def visualizeHeaderBody(number):
path = viterbiAlgorithm(np.loadtxt('dat/mail'+ str(number) + '.dat', dtype = int), states, start_prob, trans, emis, log = True)
val, index = np.unique(path, return_index=True)
index = index[1]
print('+-------------------------------------- mail %d ----------------------------+' % number)
print('Test result:')
print("state 1: 0 ~ " + str(index-1))
print('state 2:' + str(index) + ' ~ ' + str(len(path)))
with open('dat/mail'+ str(number) + '.txt') as file:
text = file.read()
header,body = spliceText(path,text)
print('<------------------------------ header ------------------------------------------------->')
print('\n'.join(header.splitlines()[0:6]))
print('\n'.join(header.splitlines()[-6:]))
print('<------------------------------ body --------------------------------------------------->')
print('\n'.join(body.splitlines()[0:9]))
print('<------------------------------ End --------------------------------------------------->')
visualizeHeaderBody(11)
visualizeHeaderBody(30)
visualizeHeaderBody(25)
visualizeHeaderBody(24)
+-------------------------------------- mail 11 ----------------------------+
Test result:
state 1: 0 ~ 2851
state 2:2852 ~ 3475
<------------------------------ header ------------------------------------------------->
From spamassassin-devel-admin@lists.sourceforge.net Thu Aug 22 15:25:29 2002
Return-Path: <spamassassin-devel-admin@example.sourceforge.net>
Delivered-To: zzzz@localhost.netnoteinc.com
Received: from localhost (localhost [127.0.0.1])
by phobos.labs.netnoteinc.com (Postfix) with ESMTP id AE2D043F9B
for <zzzz@localhost>; Thu, 22 Aug 2002 10:25:29 -0400 (EDT)
List-Id: SpamAssassin Developers <spamassassin-devel.example.sourceforge.net>
List-Unsubscribe: <https://example.sourceforge.net/lists/listinfo/spamassassin-devel>,
<mailto:spamassassin-devel-request@lists.sourceforge.net?subject=unsubscribe>
List-Archive: <http://www.geocrawler.com/redir-sf.php3?list=spamassassin-devel>
X-Original-Date: Thu, 22 Aug 2002 16:19:48 +0200
Date: Thu, 22 Aug 2002 16:19:48 +0200
<------------------------------ body --------------------------------------------------->
Hello, have you seen and discussed this article and his approach?
Thank you
http://www.paulgraham.com/spam.html
-- "Hell, there are no rules here-- we're trying to accomplish something."
-- Thomas Alva Edison
<------------------------------ End --------------------------------------------------->
+-------------------------------------- mail 30 ----------------------------+
Test result:
state 1: 0 ~ 2173
state 2:2174 ~ 5160
<------------------------------ header ------------------------------------------------->
From ilug-admin@linux.ie Fri Aug 23 11:07:51 2002
Return-Path: <ilug-admin@linux.ie>
Delivered-To: zzzz@localhost.netnoteinc.com
Received: from localhost (localhost [127.0.0.1])
by phobos.labs.netnoteinc.com (Postfix) with ESMTP id 7419C4416C
for <zzzz@localhost>; Fri, 23 Aug 2002 06:06:33 -0400 (EDT)
UAA19403
Sender: ilug-admin@linux.ie
Errors-To: ilug-admin@linux.ie
X-Mailman-Version: 1.1
Precedence: bulk
Li
<------------------------------ body --------------------------------------------------->
st-Id: Irish Linux Users' Group <ilug.linux.ie>
X-Beenthere: ilug@linux.ie
Update on this for anyone that's interested, and because I like closed
threads... nothing worse than an infinite while loop, is there?
I ended up formatting a floppy on my flatmate's (un-networked) P100 running
FAT16 Win95, and mcopied the contents of the bootdisk across. Now I have a
FAT16 Win98 install running alongside Slackware, and can play Metal Gear
<------------------------------ End --------------------------------------------------->
+-------------------------------------- mail 25 ----------------------------+
Test result:
state 1: 0 ~ 2319
state 2:2320 ~ 3238
<------------------------------ header ------------------------------------------------->
From ilug-admin@linux.ie Fri Aug 23 11:07:47 2002
Return-Path: <ilug-admin@linux.ie>
Delivered-To: zzzz@localhost.netnoteinc.com
Received: from localhost (localhost [127.0.0.1])
by phobos.labs.netnoteinc.com (Postfix) with ESMTP id 6F82C4416B
for <zzzz@localhost>; Fri, 23 Aug 2002 06:06:31 -0400 (EDT)
Precedence: bulk
List-Id: Irish Linux Users' Group <ilug.linux.ie>
X-Beenthere: ilug@linux.ie
> On Thu, 22 Aug 2002,
<------------------------------ body --------------------------------------------------->
John P. Looney wrote:
> > Sun's hardware in general is more reliable,
> ROFL. not in our experience.
Well at least our Caps-Lock keys work:
peter@staunton.ie said:
> Another problem. I have a Dell branded keyboard and if I hit Caps-Lock
> twice, the whole machine crashes (in Linux, not Windows) - even the on/
<------------------------------ End --------------------------------------------------->
+-------------------------------------- mail 24 ----------------------------+
Test result:
state 1: 0 ~ 2560
state 2:2561 ~ 3701
<------------------------------ header ------------------------------------------------->
From lejones@ucla.edu Thu Aug 22 18:29:58 2002
Return-Path: <lejones@ucla.edu>
Delivered-To: zzzz@localhost.netnoteinc.com
Received: from localhost (localhost [127.0.0.1])
by phobos.labs.netnoteinc.com (Postfix) with ESMTP id 89B2943F99
for <zzzz@localhost>; Thu, 22 Aug 2002 13:29:49 -0400 (EDT)
List-Unsubscribe: <mailto:zzzzteana-unsubscribe@yahoogroups.com>
Date: Thu, 22 Aug 2002 10:19:48 -0700
Subject: Re: [zzzzteana] Which Muppet Are You?
Reply-To: zzzzteana@yahoogroups.com
Content-Type: text/plain; charset=US-ASCII
Co
<------------------------------ body --------------------------------------------------->
ntent-Transfer-Encoding: 7bit
Hey, it's not easy being green.
leslie
Leslie Ellen Jones, Ph.D.
Jack of All Trades and Doctor of Folklore
lejones@ucla.edu
<------------------------------ End --------------------------------------------------->