import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from entropy_of_text import get_text_information
get_text_information('TEXTCZ1.txt', f=20)
Word count = 222412
Alphabet size = 117
Vocabulary size = 42826
The 20 most frequent words:
,: 0.0620
.: 0.0581
a: 0.0202
v: 0.0182
:: 0.0154
se: 0.0152
na: 0.0119
-: 0.0115
": 0.0113
): 0.0079
(: 0.0079
že: 0.0076
je: 0.0066
o: 0.0064
s: 0.0052
z: 0.0048
do: 0.0044
i: 0.0044
to: 0.0044
get_text_information('TEXTEN1.txt', f=20)
Word count = 221098
Alphabet size = 74
Vocabulary size = 9607
The 20 most frequent words:
,: 0.0666
the: 0.0601
of: 0.0424
.: 0.0255
and: 0.0250
in: 0.0215
to: 0.0206
a: 0.0142
that: 0.0119
;: 0.0097
have: 0.0094
be: 0.0094
as: 0.0093
is: 0.0092
species: 0.0080
which: 0.0080
by: 0.0077
are: 0.0073
or: 0.0073
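For context, a minimal sketch of what get_text_information might compute, assuming whitespace-separated tokens; the actual implementation in entropy_of_text.py may differ, and text_information here is an illustrative name:

from collections import Counter

def text_information(path, f=20):
    # Report word count, alphabet size, vocabulary size, and the
    # relative frequencies of the f most frequent words.
    with open(path, encoding='utf-8') as fh:
        words = fh.read().split()  # assumes whitespace-separated tokens
    counts = Counter(words)
    total = len(words)
    print(f'Word count = {total}')
    print(f'Alphabet size = {len(set("".join(words)))}')
    print(f'Vocabulary size = {len(counts)}')
    print(f'The {f} most frequent words:')
    for word, count in counts.most_common(f):
        print(f'{word}: {count / total:.4f}')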
cz_mess_char = pd.read_csv('TEXTCZ1-mess_up_characters-entropy_of_text.dat')
cz_mess_word = pd.read_csv('TEXTCZ1-mess_up_words-entropy_of_text.dat')
en_mess_char = pd.read_csv('TEXTEN1-mess_up_characters-entropy_of_text.dat')
en_mess_word = pd.read_csv('TEXTEN1-mess_up_words-entropy_of_text.dat')
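The .dat files record the mess-up experiments. A hedged sketch of the character variant, assuming each character is independently replaced by a uniformly random character from the text's alphabet with the given probability (the word variant replaces whole tokens with random vocabulary items; the number of randomized runs behind the min/max/average columns is an assumption):

import random

def mess_up_characters(words, alphabet, probability):
    # With the given probability, replace each character by a random
    # character drawn uniformly from the alphabet (a sequence).
    messed = []
    for word in words:
        chars = [random.choice(alphabet) if random.random() < probability else c
                 for c in word]
        messed.append(''.join(chars))
    return messed

Note that any probability of 1.0 or above replaces every character, which is consistent with the identical entropies reported below for 1.0, 5.0, and 10.0.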
print(cz_mess_char.round(3).to_latex())
\begin{tabular}{lrrrrr}
\toprule
{} & Probability & Min entropy & Max entropy & Average entropy & Average perplexity \\
\midrule
0 & 0.000 & 4.748 & 4.748 & 4.748 & 26.868 \\
1 & 0.001 & 4.738 & 4.739 & 4.739 & 26.697 \\
2 & 0.010 & 4.657 & 4.661 & 4.659 & 25.259 \\
3 & 0.100 & 4.000 & 4.011 & 4.006 & 16.070 \\
4 & 1.000 & 2.508 & 2.510 & 2.509 & 5.693 \\
5 & 5.000 & 2.508 & 2.509 & 2.509 & 5.691 \\
6 & 10.000 & 2.508 & 2.510 & 2.509 & 5.693 \\
\bottomrule
\end{tabular}
print(en_mess_char.round(3).to_latex())
\begin{tabular}{lrrrrr}
\toprule
{} & Probability & Min entropy & Max entropy & Average entropy & Average perplexity \\
\midrule
0 & 0.000 & 5.287 & 5.287 & 5.287 & 39.055 \\
1 & 0.001 & 5.283 & 5.284 & 5.284 & 38.950 \\
2 & 0.010 & 5.248 & 5.251 & 5.250 & 38.043 \\
3 & 0.100 & 4.725 & 4.737 & 4.732 & 26.582 \\
4 & 1.000 & 1.601 & 1.602 & 1.601 & 3.035 \\
5 & 5.000 & 1.600 & 1.602 & 1.601 & 3.034 \\
6 & 10.000 & 1.601 & 1.602 & 1.601 & 3.034 \\
\bottomrule
\end{tabular}
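In these tables, Average perplexity is 2 raised to the average entropy (2^4.748 ≈ 26.87, 2^5.287 ≈ 39.06), and the entropy itself appears to be the conditional entropy H(J|I) of a word given the preceding word. A minimal sketch of that computation, assuming maximum-likelihood bigram estimates:

import math
from collections import Counter

def conditional_entropy(words):
    # H(J|I) = -sum over bigrams (i, j) of P(i, j) * log2 P(j|i),
    # with both distributions estimated by relative frequencies.
    bigrams = Counter(zip(words, words[1:]))
    contexts = Counter(words[:-1])
    n = len(words) - 1
    return -sum(c / n * math.log2(c / contexts[i])
                for (i, j), c in bigrams.items())

# Perplexity is then 2 ** conditional_entropy(words).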
ax = cz_mess_char.plot(x='Probability', y='Average entropy', marker='o', logx=True, label='Czech', figsize=(20, 20))
en_mess_char.plot(ax=ax, x='Probability', y='Average entropy', marker='o', logx=True, label='English')
plt.xlabel('Probability (log scale)')
plt.ylabel('Conditional entropy')
# plt.title('Character mess up')
plt.savefig('char.eps', format='eps')
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.
print(cz_mess_word.round(3).to_latex())
\begin{tabular}{lrrrrr}
\toprule
{} & Probability & Min entropy & Max entropy & Average entropy & Average perplexity \\
\midrule
0 & 0.000 & 4.748 & 4.748 & 4.748 & 26.868 \\
1 & 0.001 & 4.746 & 4.747 & 4.747 & 26.853 \\
2 & 0.010 & 4.738 & 4.741 & 4.739 & 26.704 \\
3 & 0.100 & 4.630 & 4.640 & 4.635 & 24.852 \\
4 & 1.000 & 2.521 & 2.523 & 2.521 & 5.741 \\
5 & 5.000 & 2.519 & 2.524 & 2.521 & 5.741 \\
6 & 10.000 & 2.520 & 2.523 & 2.522 & 5.743 \\
\bottomrule
\end{tabular}
print(en_mess_word.round(3).to_latex())
\begin{tabular}{lrrrrr}
\toprule
{} & Probability & Min entropy & Max entropy & Average entropy & Average perplexity \\
\midrule
0 & 0.000 & 5.287 & 5.287 & 5.287 & 39.055 \\
1 & 0.001 & 5.289 & 5.290 & 5.289 & 39.107 \\
2 & 0.010 & 5.306 & 5.308 & 5.307 & 39.600 \\
3 & 0.100 & 5.454 & 5.463 & 5.457 & 43.924 \\
4 & 1.000 & 4.553 & 4.554 & 4.553 & 23.480 \\
5 & 5.000 & 4.553 & 4.554 & 4.553 & 23.478 \\
6 & 10.000 & 4.553 & 4.554 & 4.554 & 23.486 \\
\bottomrule
\end{tabular}
ax = cz_mess_word.plot(x='Probability', y='Average entropy', marker='o', logx=True, label='Czech', figsize=(20, 20))
en_mess_word.plot(ax=ax, x='Probability', y='Average entropy', marker='o', logx=True, label='English')
plt.xlabel('Probability (log scale)')
plt.ylabel('Conditional entropy')
# plt.title('Word mess up')
plt.savefig('word.eps', format='eps')
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.
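The script invoked below smooths a trigram model by linear interpolation, P'(w3 | w1, w2) = λ0/|V| + λ1 p1(w3) + λ2 p2(w3 | w2) + λ3 p3(w3 | w1, w2), with the weights trained by EM on heldout data. A hedged sketch of the EM loop, assuming the four component distributions are supplied as functions (all names here are illustrative, not the script's actual API):

def em_lambdas(heldout, components, iterations=50):
    # heldout:    list of (w1, w2, w3) trigrams
    # components: four functions mapping a trigram to a probability
    #             (uniform, unigram, bigram, trigram)
    lambdas = [0.25, 0.25, 0.25, 0.25]
    for _ in range(iterations):
        expected = [0.0] * 4
        for trigram in heldout:
            weighted = [l * p(trigram) for l, p in zip(lambdas, components)]
            total = sum(weighted)
            for k in range(4):
                expected[k] += weighted[k] / total
        total_expected = sum(expected)
        lambdas = [e / total_expected for e in expected]
    return lambdas

Training the weights on the training data itself drives nearly all mass to the trigram component (the first output line below shows λ3 ≈ 0.9995), because the unsmoothed trigram model fits its own training data best; the heldout weights are the ones actually used.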
!python cross_entropy_language_modeling.py TEXTCZ1.txt
Train data lambdas = [3.73438475658215e-25, 1.5125788838842315e-12, 0.0005041322074100572, 0.9994958677910774]
Heldout data lambdas = [0.25305725837783716, 0.4426639269060694, 0.24247432700778807, 0.06180448770830538]
Cross entropy = 10.392703689577282
Tweaked lambdas for add probability 0.1 = [0.22775153254005345, 0.3983975342154625, 0.21822689430700926, 0.15562403893747484]
Cross entropy with add probability 0.1 = 10.404568009870612
Tweaked lambdas for add probability 0.2 = [0.20244580670226975, 0.3541311415248556, 0.19397946160623047, 0.2494435901666443]
Cross entropy with add probability 0.2 = 10.472724117566074
Tweaked lambdas for add probability 0.3 = [0.177140080864486, 0.3098647488342486, 0.16973202890545164, 0.34326314139581376]
Cross entropy with add probability 0.3 = 10.577591291116288
Tweaked lambdas for add probability 0.4 = [0.15183435502670228, 0.26559835614364163, 0.14548459620467283, 0.4370826926249832]
Cross entropy with add probability 0.4 = 10.717439733535752
Tweaked lambdas for add probability 0.5 = [0.1265286291889186, 0.22133196345303477, 0.12123716350389406, 0.5309022438541526]
Cross entropy with add probability 0.5 = 10.897738536995064
Tweaked lambdas for add probability 0.6 = [0.10122290335113487, 0.1770655707624278, 0.09698973080311524, 0.6247217950833222]
Cross entropy with add probability 0.6 = 11.131578585139714
Tweaked lambdas for add probability 0.7 = [0.07591717751335116, 0.13279917807182084, 0.07274229810233643, 0.7185413463124916]
Cross entropy with add probability 0.7 = 11.445958841488201
Tweaked lambdas for add probability 0.8 = [0.05061145167556744, 0.0885327853812139, 0.04849486540155762, 0.8123608975416611]
Cross entropy with add probability 0.8 = 11.903409407894074
Tweaked lambdas for add probability 0.9 = [0.0253057258377837, 0.044266392690606914, 0.024247432700778792, 0.9061804487708306]
Cross entropy with add probability 0.9 = 12.705464163740748
Tweaked lambdas for add probability 0.95 = [0.012652862918891866, 0.022133196345303485, 0.01212371635038941, 0.9530902243854152]
Cross entropy with add probability 0.95 = 13.519506109325736
Tweaked lambdas for add probability 0.99 = [0.002530572583778373, 0.004426639269060697, 0.002424743270077882, 0.990618044877083]
Cross entropy with add probability 0.99 = 15.42584599785755
Tweaked lambdas for mult probability 0.0 = [0.2697276367904423, 0.47182481807527626, 0.2584475451342815, 0.0]
Cross entropy with mult probability 0.0 = 10.549469186530565
Tweaked lambdas for mult probability 0.1 = [0.26806059894918177, 0.46890872895835556, 0.2568502233216321, 0.006180448770830538]
Cross entropy with mult probability 0.1 = 10.479124316078046
Tweaked lambdas for mult probability 0.2 = [0.2663935611079213, 0.4659926398414349, 0.2552529015089828, 0.012360897541661076]
Cross entropy with mult probability 0.2 = 10.454809086697042
Tweaked lambdas for mult probability 0.3 = [0.2647265232666608, 0.46307655072451426, 0.25365557969633346, 0.018541346312491613]
Cross entropy with mult probability 0.3 = 10.438617512951282
Tweaked lambdas for mult probability 0.4 = [0.26305948542540025, 0.46016046160759355, 0.25205825788368413, 0.024721795083322153]
Cross entropy with mult probability 0.4 = 10.42663126656946
Tweaked lambdas for mult probability 0.5 = [0.26139244758413976, 0.4572443724906729, 0.2504609360710348, 0.03090224385415269]
Cross entropy with mult probability 0.5 = 10.417343712438154
Tweaked lambdas for mult probability 0.6 = [0.2597254097428793, 0.4543282833737522, 0.24886361425838546, 0.037082692624983225]
Cross entropy with mult probability 0.6 = 10.40998434436724
Tweaked lambdas for mult probability 0.7 = [0.2580583719016187, 0.45141219425683143, 0.24726629244573606, 0.04326314139581376]
Cross entropy with mult probability 0.7 = 10.404096098558718
Tweaked lambdas for mult probability 0.8 = [0.2563913340603582, 0.4484961051399108, 0.24566897063308674, 0.049443590166644305]
Cross entropy with mult probability 0.8 = 10.399381475725114
Tweaked lambdas for mult probability 0.9 = [0.25472429621909765, 0.4455800160229901, 0.2440716488204374, 0.05562403893747484]
Cross entropy with mult probability 0.9 = 10.39563404454161
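The tweaked weights match a simple rule: for add probability p, the trigram weight moves p of the way to 1, so λ3' = λ3 + p(1 - λ3); for mult probability p, λ3' = p * λ3; in both cases the other three weights are rescaled by (1 - λ3')/(1 - λ3) so the four still sum to 1 (the printed values above agree with this to full precision). A sketch:

def tweak_lambdas(lambdas, p, mode):
    # Boost (mode='add') or discount (mode='mult') the trigram weight,
    # rescaling the remaining weights so they all still sum to 1.
    l3 = lambdas[3]
    new_l3 = l3 + p * (1 - l3) if mode == 'add' else p * l3
    scale = (1 - new_l3) / (1 - l3)
    return [l * scale for l in lambdas[:3]] + [new_l3]

Forcing λ3 toward 1 degrades cross entropy sharply (15.43 bits at add probability 0.99 versus 10.39 at the EM optimum), while discounting λ3 costs little (10.55 bits even at mult probability 0.0), since the bigram and unigram components absorb the shifted mass.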