# 1 - Import necessary libraries
import os, random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# 2 - Retrieve the names of the 22 files and assign them to a List variable
# You would need to filter out other files
# Display the names of these 22 files
Files= os.listdir('CollegeScorecard_Raw_Data')
Files
# 3 - Write code to randomly select one file name out of the 22 files names.
# Hint: generate a random integer between 0 and 21 first
# Display the file name
random_file=random.choice(os.listdir("CollegeScorecard_Raw_Data"))
print ("File Name:",random_file)
File Name: MERGED2011_12_PP.csv
# 4 - Only read the two columns: college name (INSTNM) and in state tution ("TUITIONFEE_IN")
# from this file into a data frame aand display use info() function to display a summary information
df = pd.read_csv(random_file,index_col=None, header=0, usecols=["INSTNM","TUITIONFEE_IN"])
df
# 5 - Find out how many observations have missing value.
print(" \nCount total NaN at each column in a DataFrame : \n\n", df.isnull().sum())
Count total NaN at each column in a DataFrame :
INSTNM 0
TUITIONFEE_IN 3302
dtype: int64
# 6 - Drop those observations with missing value
# Disaply the number of observations afterward
df["TUITIONFEE_IN"] = df["TUITIONFEE_IN"].fillna(0)
print(" \nCount total NaN at each column in a DataFrame : \n\n", df.isnull().sum())
Count total NaN at each column in a DataFrame :
INSTNM 0
TUITIONFEE_IN 0
dtype: int64
# 7 - Find out how many observations have 0 tution.
df[df["TUITIONFEE_IN"]== 0]
# 8 - Drop those observations with 0 tuition
# Disaply the number of observations afterward
df= df[df['TUITIONFEE_IN'] != 0]
df
# 9 - Calculate and display the mean (average) of the tuitions of all the remaining observations
df.describe()
# 10 - Randomly select just 1 observation from the data frame.
# Display the tuition of that observation.
dfr= df.sample()
dfr
# 11 - Calculate the different between the tuition of this observation and
# the mean tuition off all observations calculated earlier.
# Display the difference
TF1= dfr["TUITIONFEE_IN"]
Diff= TF1-(df["TUITIONFEE_IN"].mean())
Diff
# 12 - Repeat 10 to 11 several times get the feel of the size of the difference in means
dfr1= df.sample()
TF2= dfr1["TUITIONFEE_IN"]
Diff= TF2-(df["TUITIONFEE_IN"].mean())
Diff
dfr2= df.sample()
TF3= dfr2["TUITIONFEE_IN"]
Diff= TF3-(df["TUITIONFEE_IN"].mean())
Diff
dfr3= df.sample()
TF4= dfr3["TUITIONFEE_IN"]
Diff= TF4-(df["TUITIONFEE_IN"].mean())
Diff
dfr4= df.sample()
TF5= dfr4["TUITIONFEE_IN"]
Diff= TF5-(df["TUITIONFEE_IN"].mean())
Diff
# 13 - Randomly select 20 observations, display the tuition of these observation.
dfr20= df.sample(20)
dfr20
# 14 - Calculate and displat the mean tuition of these 20 observations
df_mean20= dfr20["TUITIONFEE_IN"].mean()
df_mean20
# 15 - Calculate the different between the average tuition of these 20 observations
# and the mean tuition off all observations calculated earlier.
# Display the difference
Diff2= df_mean20 -(df["TUITIONFEE_IN"].mean())
Diff2
# 16 - Repeat 13 to 15 several times and get the feel of the size of the difference in means
# Compare these differences with the differences calculated earlier with just 1 observation
# Describe your hunch/conclusion
dfr20= df.sample(20)
df_mean20= dfr20["TUITIONFEE_IN"].mean()
Diff3= df_mean20 -(df["TUITIONFEE_IN"].mean())
Diff3
dfr20= df.sample(20)
df_mean20= dfr20["TUITIONFEE_IN"].mean()
Diff4= df_mean20 -(df["TUITIONFEE_IN"].mean())
Diff4
dfr20= df.sample(20)
df_mean20= dfr20["TUITIONFEE_IN"].mean()
Diff5= df_mean20 -(df["TUITIONFEE_IN"].mean())
Diff5
# 17 - write function Code here DIFFM is the mean difference using two variables X and Y
def DiffM (x,y):
df_x=y.sample(x)
mean_x=df_x["TUITIONFEE_IN"].mean()
df_mean=df["TUITIONFEE_IN"].mean()
return mean_x-df_mean
# 18 - test the function by passing a sample size and the name of the dataframe
# Dsplay the return of the function Use sample size 10 and the orgiginal data frame to find the differnce in means
DiffM(10,df)
# 19 - Create a list of sequence numbers from 1 to 50, name it "sample_sizes";
# display it to make sure its members are from 1 to 50
sample_sizes= list(range(1, 51))
print(sample_sizes)
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50]
# 20 - Create an empty list and name it "means_diff" for differences of population mean and sample means;
# Loop through the list sample_sizes:
# For each element in the sample size list, obtain a random sample of that size from the data frame;
# Calculate the sample mean
# Calculate the difference between the population mean and the sample mean
# Append the difference to the list "means_diff"
# Display the list "means_diff" after the loop is completed
means_diff=[]
for num in sample_sizes:
df_num= df.sample(num)
mean_num=df_num['TUITIONFEE_IN'].mean()
difference=mean_num-(df["TUITIONFEE_IN"].mean())
means_diff.append(difference)
print(means_diff)
[-11156.547420590223, 531.4525794097772, -8520.214087256889, 212.95257940977717, -9308.147420590223, 4912.952579409777, -7840.118849161651, 4575.952579409777, 2733.452579409777, -1125.0474205902228, -3429.8201478629508, -755.7140872568889, -1590.8551128979143, -124.47599201879348, -2183.2807539235564, -3075.422420590223, 565.6290499980132, 3459.3414682986677, -1627.1263679586446, -188.0974205902221, -200.11884916165218, -2832.183784226587, -269.8952466771789, 1569.7025794097772, -215.42742059022203, 1860.7218101790077, -999.4363094791115, -1782.0831348759366, -2229.7543171419475, -336.9807539235553, 2733.065482635584, -149.07867059022283, 3138.664700621899, -479.19447941375256, -745.2617063045091, -1269.8529761457776, 1353.3714983286955, 23.42626362030387, -778.5217795645822, 3162.577579409777, -156.18156693168567, 1335.8573413145386, -663.2450950088278, 1896.6798521370492, 605.98591274311, 825.7134489749951, -2150.015505696605, -51.94325392355677, 1178.2893141036548, -1050.5274205902224]
# 21 - Make a scatter plot with sample size on the x-axs and mean difference on the y-axis
# Observe as the sample size increases, the sample means converge to the population mean.
# Make sure you make the plot large enough
plt.scatter(sample_sizes,means_diff,plt.gcf().set_size_inches(15, 8))
plt.xlabel('Sample size')
plt.ylabel('Mean difference')
plt.title('Sample Size V.S. Differences in Means')
plt.show()
# 22 - repeat 18 to 21 by replacing 50 with a larger number. For example, 300, or even 1000.
# and see how the plot looks.
sample_sizes2= list(range(1, 301))
print(sample_sizes2)
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300]
means_diff=[]
for num in sample_sizes2:
df_num= df.sample(num)
mean_num=df_num['TUITIONFEE_IN'].mean()
difference=mean_num-(df["TUITIONFEE_IN"].mean())
means_diff.append(difference)
print(means_diff)
[513.4525794097772, 3293.452579409777, -400.2140872568889, 3266.702579409777, -1770.5474205902228, 5766.785912743109, -1922.8331348759366, 41.32757940977717, 4526.897023854222, -1835.0474205902228, -2862.365602408405, 6511.035912743109, 9411.760271717469, -3238.904563447366, 2956.852579409777, 1327.1400794097772, -1753.0180088255165, -2210.991865034668, -1679.2316311165378, 914.1525794097779, -2035.7855158283182, -2085.183784226587, -2121.2430727641367, 2702.160912743111, -523.1874205902222, -1150.8551128979143, -3232.8807539235568, -2943.6902777330797, -3517.5129378316014, 1901.7192460764436, 638.2912890871958, 366.7025794097772, -2013.7595418023448, -1194.1944794137526, -2991.318849161651, 444.8970238542224, 249.56068751788553, -2000.7579469060129, 2008.4525794097772, 3153.7525794097764, 527.6720916048998, -610.7617063045091, 1406.1502538283821, -287.4337842265868, -1691.0585317013338, 686.3656228880373, 490.28236664381984, 740.9734127431111, -1261.9555838555298, -1692.5674205902233, 369.4721872529153, 692.2218101790077, -264.0002507789013, -494.1400131828159, -837.1837842265868, -180.0652777330797, 318.9613513396016, -1003.030179210913, -266.2931833020866, -1397.3807539235568, 1200.4525794097772, 286.92032134526016, 1114.3573413145386, -416.1411705902228, 558.944887102085, -2240.941359984163, -578.2489131275361, 410.2319911744835, 251.24968085905311, -155.1474205902232, 1270.6638470154103, 365.1331349653319, -120.02687264501765, 8.276903734102234, -1369.4540872568887, -1070.218473221801, 79.23180018899802, 617.1192460764432, -910.6866610965517, -1393.3724205902236, 682.2797399036044, 1270.7452623366062, -1196.691998903476, 707.9525794097772, 1766.4172852921292, -2391.373001985572, 239.57901619138647, -344.8996933174949, 203.7447142412384, -295.2363094791126, 1898.5514805086787, 1469.9851881054292, -29.138818439685565, 653.1866219629683, 5.705210988724502, -1074.9224205902228, 926.1020639458602, 150.6668651240634, 1192.947528904726, -381.0774205902235, -871.9533611842817, 199.7859127431111, -645.4309157358548, -248.29742059022283, 2070.014484171681, 676.1412586550596, -23.77171965564412, -1032.5937168865185, -805.9327416911401, 227.89803395523086, 755.731858689056, -418.4402777330797, 790.7092165779195, -265.1175960288201, -382.8778553728316, -421.01293783160145, 114.85428881148619, 1365.028850596218, -493.0180088255165, 594.9692460764436, 800.1302653601906, 121.5427433442037, 1646.3631485154674, 1268.8719342484874, -441.94742059022246, -1845.2537697965727, 217.3265951578087, -1006.9692955902228, -1338.4388934584404, 1291.4756563328538, 1286.796090860158, -167.7443902871928, -752.675240139095, -693.3832414857461, -127.62890207170494, 1110.1216970568366, 143.94163050466705, -755.7937974018168, -862.7416651945387, -300.81170630450833, -762.0651510866774, -16.35023749163156, -388.14182618462837, -1497.929365034668, -576.8715585212576, 229.0347711905997, 59.819926348553054, 715.3106875178855, 282.82841833595194, 71.28591274311111, 1165.4062217938827, -559.4816311165378, 594.2957166646793, -454.52144656424935, 536.065482635584, -1073.9063949491974, 563.3124520212423, 1043.5411870047137, 618.0123278374504, 321.6400794097772, 250.3159334470438, -588.3807539235568, 1.7900027226605744, -682.7425425414422, 1032.8889430461404, -1155.3546495058854, -421.42766011118147, -292.9938491616522, -111.64801230619923, 156.8761088215415, 1109.160181748959, 404.5281608051264, -757.0792124977379, -82.19109875114191, 971.2468651240633, 39.003715773413205, 373.07404833633154, -710.7946116014591, -1231.2625043891057, -597.8863094791122, -1023.6523929659143, -433.4155524583548, 142.46897285240084, -786.2593771119627, 259.6147415719388, 599.6084933882721, -1167.632982087549, 9.452579409777172, 563.7435847007819, -205.96321006390644, -1143.4584153546202, 1269.0723710764432, -558.8634827663882, 1098.9989711623548, -311.6551128979154, -9.761706304509062, 513.4424271255139, 776.9071248643231, -935.5725462183636, 340.75257940977644, -553.3384653663416, 89.20010416225159, 26.1570129073134, 665.2270892136985, 1761.803798921972, -373.6590710756591, -143.27205827138278, -357.3310744363771, -419.65268374811785, -759.8093253521274, -176.04979025846842, 225.38654167392815, -427.1859182428052, -295.57545797340026, 172.1828119679176, 32.855357187554546, 520.5631784881189, -158.33641141591033, 626.9228990444808, 608.675306682504, -667.6876920834347, 221.74537220256934, -139.41289144223992, -168.4715277330797, -205.00075392355575, 1353.9924024186275, -1360.0496232333953, 1181.2771408132867, 496.0988676193847, -1249.2952466771785, -114.8937409365426, -1050.4569033488442, 156.2809055900343, -128.88930093210365, -1504.8027397391597, -653.221149403782, 162.69308573889066, 208.36434411566006, -419.2754540630267, 297.68174607644323, -463.38974424167463, -187.19204868939596, 262.08220903940673, -652.9326664918626, -2.6086450800194143, 255.26152249920779, -608.8591614809102, -660.6885496224804, 84.28390471098282, 155.20057940977676, 279.3290734336824, 145.06765877485668, 587.2984292121491, -1183.9450583854978, 884.0565009784041, 248.35101690977717, 1119.5654198767024, -269.46602524138507, -737.9257989686012, -255.1128052056083, 172.98131504196135, 1008.1396023105408, 754.3765337824007, 378.7556097128072, -163.11345832607185, 11.076639560153126, 740.692279784309, -61.424286261864836, -359.5213982853893, -685.7177909605925, -827.4145792618092, -96.8010970608102, -1039.0162850590877, 415.0401706506527, -1.7946933174953301, -446.77930464819474, 113.88579240616673, 11.48495351049678, -1318.257098009577, -93.85813487593623, -165.6826519069491, 336.073146785664, 562.4525794097772, -549.6002374916316, -59.161455677942286, -296.07189611469767, 220.7487466571638, 78.15049607644323, -1459.2013998289767, -6.681903348842752, 783.4731979664775, 229.45942872484557, 445.7392688295731, 946.2859127431111, 580.5102065284209, 369.82420103139884, -358.61812766092953, -232.7957427378733, -167.20962794808293, -880.0207539235562]
plt.scatter(sample_sizes2,means_diff,plt.gcf().set_size_inches(15, 8))
plt.xlabel('Sample size')
plt.ylabel('Mean difference')
plt.title('Sample Size V.S. Differences in Means')
plt.show()