# Import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats

# Load the invoice records from disk and peek at the first rows
dfInvoice = pd.read_csv("accounts.csv")
dfInvoice.head()

# Column types as inferred on load. Notes from this first look:
# - countryCode and invoiceNumber are identifiers, so they do not need to be
#   ints (no arithmetic will be done on them)
# - InvoiceDate will need converting later if any date operations are required
# - InvoiceAmount should be a float
# - Disputed could be turned into a boolean to make it easier to work with
dfInvoice.dtypes

# Check for any fully duplicated rows
dfInvoice.loc[dfInvoice.duplicated()]

# Summary statistics
dfInvoice.describe()

# Null count per column.
# The DaysLate nulls shouldn't be a problem; the InvoiceAmount nulls need to
# be kept in mind.
dfInvoice.isna().sum()

# Some invoice amounts are labelled "missing" or "not reported"
dfInvoice["InvoiceAmount"].sort_values(ascending=False)

# Convert InvoiceAmount to a numeric column; errors="coerce" turns every
# non-numeric entry into NaN instead of raising
dfInvoice["InvoiceAmount"] = pd.to_numeric(
    dfInvoice["InvoiceAmount"], errors="coerce"
)

# Confirm the new datatype
dfInvoice.dtypes

# There are more nulls now, but that should be fine for this question
dfInvoice.isna().sum()

# Total invoice amount per country code
dfInvoiceByCountry = dfInvoice.groupby("countryCode")["InvoiceAmount"].sum()

# Preview the per-country totals
dfInvoiceByCountry

# Bar plot, largest total first, so the top countries are the leftmost bars
dfInvoiceByCountry.sort_values(ascending=False).plot.bar()
plt.xlabel("Country")
# The bar heights are summed InvoiceAmount values, not a count of invoices
plt.ylabel("Total Invoice Amount")
plt.show()

# Pie chart of each country's share of the overall total
dfInvoiceByCountry.plot.pie(autopct="%.1f%%")
plt.legend()
plt.show()

# Column names currently in the frame
dfInvoice.columns

# PaperlessBill is a discrete, two-valued identifier; list its distinct labels
dfInvoice["PaperlessBill"].unique()

# DaysLate has a lot of nulls; replace them with zero
dfInvoice["DaysLate"] = dfInvoice["DaysLate"].fillna(0)

# Box plot of DaysLate, one box per billing method
dfInvoicePaper = dfInvoice.loc[dfInvoice["PaperlessBill"].eq("Paper")]
dfInvoiceElectronic = dfInvoice.loc[dfInvoice["PaperlessBill"].eq("Electronic")]
data = [dfInvoicePaper["DaysLate"], dfInvoiceElectronic["DaysLate"]]
plt.boxplot(data, labels=["Paper", "Electronic"])
plt.ylabel("Days Late")
plt.show()

# 0/1 indicator: 1 when the bill is 'Electronic', 0 for anything else
dfInvoice["Paperless"] = dfInvoice["PaperlessBill"].eq("Electronic").astype(int)

dfInvoice.dtypes

# Kendall rank correlation between DaysLate and the Paperless indicator
dfInvoice[["DaysLate", "Paperless"]].corr(method="kendall")

# Add 0/1 indicator columns for the correlation and regression steps:
# isDisputed is 1 when Disputed == 'Yes'; isLate is 1 when DaysLate > 0
dfInvoice["isDisputed"] = dfInvoice["Disputed"].eq("Yes").astype(int)
dfInvoice["isLate"] = dfInvoice["DaysLate"].gt(0).astype(int)

# Pairwise scatter plots, to look for correlations visually
pd.plotting.scatter_matrix(dfInvoice, figsize=(12, 12))
plt.show()

# Closer look at InvoiceAmount against DaysLate
plt.scatter(x=dfInvoice["InvoiceAmount"], y=dfInvoice["DaysLate"])
plt.xlabel("InvoiceAmount")
plt.ylabel("Dayslate")
plt.show()

# Kendall correlation matrix of the amount and the three indicator columns
dfCorrelation = dfInvoice[
    ["InvoiceAmount", "Paperless", "isDisputed", "isLate"]
].corr(method="kendall")
dfCorrelation

# Modules for the train/test split and the regression model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Renumber the rows. drop=True stops the old index from being inserted into
# the frame as an extra "index" column.
dfInvoice = dfInvoice.reset_index(drop=True)

# There are nulls in the invoice amount; replace them with the column average.
# NOTE(review): the average is taken over every row before the train/test
# split below, so rows that end up in the test set contribute to the value
# imputed into the training rows.
avgInvoice = dfInvoice["InvoiceAmount"].mean()
dfInvoice["InvoiceAmount"] = dfInvoice["InvoiceAmount"].fillna(avgInvoice)

# Features involved in the prediction
dfX = dfInvoice[["InvoiceAmount", "Paperless", "isDisputed"]]
# Column to predict: the 0/1 isLate indicator
dfY = dfInvoice["isLate"]

# Hold out 40% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    dfX, dfY, test_size=0.4, random_state=0
)

# Fit an ordinary least squares model on the training rows.
# NOTE(review): the target is binary, so the predictions are continuous
# scores rather than class labels — confirm a classifier is not wanted here.
linearRegression = LinearRegression()
linearRegression.fit(X_train, Y_train)

# Table of the fitted coefficients, one row per feature
pd.DataFrame(linearRegression.coef_, dfX.columns, ["Regression Coeffs"])

# Predict isLate for an invoice amount of 1,000 that is paperless and not
# disputed. The input is a DataFrame with the training column names, because
# the model was fitted on named columns.
linearRegression.predict(pd.DataFrame([[1000, 1, 0]], columns=dfX.columns))

# Predict the test set
Y_predicted = linearRegression.predict(X_test)

# Scatter plot of the real values against the predictions
plt.scatter(Y_test, Y_predicted)
plt.xlabel("Real data")
plt.ylabel("Predicted data")
plt.show()

# R^2 (coefficient of determination) on the test set; for LinearRegression
# score() is not a classification accuracy
linearRegression.score(X_test, Y_test)