Assignment 1 Group 1
General remarks
Throughout the code SQL queries are used to filter and select data. Also pandas is used to convert the data type of the columns with data from text to numeric and to filter/sort data to get insight in max/min/count of a dataset. The code fragments are repeated with different queries. Certain accounts are explored in more depth considering their relations. Where there were interesting findings it has been documented.
Import dependencies
#!/usr/bin/python3
import pandas as pd
import sqlite3
import requests
import numpy as np
import matplotlib.pyplot as plt
Connect to database file
con = sqlite3.connect('./transaction_data.db')
Describe dataset
Database structure
cur = con.cursor()
def describe_table():
cur.execute("select sql from sqlite_master where type = 'table' and name = 'transaction_data';")
print(cur.fetchall()[0][0])
describe_table()
Total rows
# Count the total amount of rows in dataset
length = pd.read_sql_query("SELECT COUNT(*) from transaction_data", con).iloc[0].values[0]
print("The total amount of rows in the dataset is: {}".format(length))
Amount information
# Select the nameOrig and amount
res = pd.read_sql_query("SELECT nameOrig,amount FROM transaction_data;",con)
res[["amount"]] = res[["amount"]].apply(pd.to_numeric)
# Select all amount values of 0.0
res[res["amount"]==0.0].sort_values(by="nameOrig")
# Select a nameOrig from one of the 0.0 transactions to check whether it's correct balancewise
data = pd.read_sql_query("SELECT amount, nameDest, oldbalanceDest, newbalanceDest FROM transaction_data WHERE nameOrig='C539112012'", con)
data[["amount","newbalanceDest","oldbalanceDest"]] = data[["amount","newbalanceDest","oldbalanceDest"]].apply(pd.to_numeric)
data
Unique senders and receivers
It can be seen that the amount of senders and receivers differ from each other in the range of 4 million. Thus there are more senders than receivers.
# Select distinct nameOrig
unique_senders = pd.read_sql_query("SELECT COUNT(DISTINCT nameOrig) FROM transaction_data", con).iloc[0].values[0]
print("The total amount of unique senders is: {}".format(unique_senders))
# Select distinct nameDest
unique_receivers = pd.read_sql_query("SELECT COUNT(DISTINCT nameDest) FROM transaction_data", con).iloc[0].values[0]
print("The total amount of unique receivers is: {}".format(unique_receivers))
Frequency of receiving per destination
res = pd.read_sql_query("SELECT nameDest, COUNT(amount) AS Count FROM transaction_data GROUP BY nameDest ORDER BY Count DESC;",con)
# Boxplot With the outlier
plt.figure()
plt.title("Boxplot of frequency of transactions to nameDest")
plt.xlabel("")
plt.ylabel("Amount of transactions to account")
res.boxplot(column="Count")
plt.show()
# Boxplot After removing the outlier (C52983754)
plt.figure()
plt.title("Boxplot without C52983754")
plt.xlabel("")
plt.ylabel("Amount of transactions to account")
res[res["nameDest"]!="C52983754"].boxplot(column="Count")
plt.show()
res
Inspecting the data of C52983754
# Selecting the data considering the outlier
res = pd.read_sql_query("SELECT nameOrig, nameDest, timestamp, amount FROM transaction_data WHERE nameDest='C52983754' ORDER BY amount DESC;",con)
res
Frequency of sending per origin
# Select count of transactions from certain nameOrig
res = pd.read_sql_query("SELECT nameOrig, COUNT(amount) AS Count FROM transaction_data GROUP BY nameOrig ORDER BY Count DESC;",con)
# Plot a boxplot
plt.figure()
plt.title("Boxplot of frequency of transactions from nameOrig")
plt.xlabel("")
plt.ylabel("Amount of transactions to account")
res.boxplot(column="Count")
plt.show()
res
Inspecting the outlier (C1286084959)
First the outlier is identified,
# Select data where nameDest is C1286084959
res = pd.read_sql_query("SELECT nameOrig, timestamp, nameDest, amount FROM transaction_data WHERE nameDest='C1286084959' ORDER BY timestamp ASC",con)
res[["amount"]] = res[["amount"]].apply(pd.to_numeric)
res["amount"]
Inspecting timestamp meaning
# Select data grouped by timestamp
res = pd.read_sql_query("SELECT nameDest, timestamp, COUNT(*) AS Count FROM transaction_data GROUP BY timestamp ;",con)
# Plot timestamp vs frequency of counts
res.plot(x="timestamp",y="Count")
plt.title("Frequency of transaction")
plt.xlabel("Timestamp")
plt.ylabel("Transactions")
# Select amount per timestamp and plot it
data = pd.read_sql_query("SELECT timestamp, amount FROM transaction_data", con)
data[["amount"]] = data[["amount"]].apply(pd.to_numeric)
data.groupby(by="timestamp").sum().plot()
plt.title("Amount of transaction")
plt.xlabel("Timestamp")
plt.ylabel("Amount transferred")
# Get first 10 values
data.groupby(by="timestamp").sum().sort_values(by="amount", ascending=False)[0:10]
Selecting data of timestamp 304 for further inspection
# Select data of timestamp 304 for further inspection
data = pd.read_sql_query("SELECT nameOrig, nameDest, timestamp, amount FROM transaction_data WHERE timestamp='304'", con)
data[["amount"]] = data[["amount"]].apply(pd.to_numeric)
data.sort_values(by="amount", ascending=False)
Difference between 'M' and 'C' accounts
res = pd.read_sql_query("SELECT nameOrig, COUNT(*) AS Count FROM transaction_data WHERE nameOrig LIKE 'C%' AND nameDest LIKE 'C%' GROUP BY nameOrig;",con)
print("Transactions from C to C: {}".format(res["Count"].sum()))
res = pd.read_sql_query("SELECT nameOrig, COUNT(*) AS Count FROM transaction_data WHERE nameOrig LIKE 'C%' AND nameDest LIKE 'M%' GROUP BY nameOrig;",con)
print("Transactions from C to M: {}".format(res["Count"].sum()))
res = pd.read_sql_query("SELECT nameOrig, COUNT(*) AS Count FROM transaction_data WHERE nameOrig LIKE 'M%' AND nameDest LIKE 'C%' GROUP BY nameOrig;",con)
print("Transactions from M to C: {}".format(res["Count"].sum()))
res = pd.read_sql_query("SELECT nameOrig, COUNT(*) AS Count FROM transaction_data WHERE nameOrig LIKE 'M%' AND nameDest LIKE 'M%' GROUP BY nameOrig;",con)
print("Transactions from M to M: {}".format(res["Count"].sum()))
Transaction pairs
# Select data grouped by pairs
res = pd.read_sql_query("SELECT [nameOrig], [nameDest], COUNT(*) AS [Count] FROM [transaction_data] GROUP BY [nameOrig], [nameDest] ORDER BY [Count] DESC;",con)
res
# Select data of C1286084959 to see the transactions going from C1286084959 to C2342523425
data = pd.read_sql_query("SELECT nameOrig, oldbalanceOrig, newbalanceOrig, oldbalanceDest, newbalanceDest FROM transaction_data WHERE nameOrig='C1286084959'", con)
data[["newbalanceOrig","newbalanceDest","oldbalanceOrig","oldbalanceDest"]] = data[["newbalanceOrig","newbalanceDest","oldbalanceOrig","oldbalanceDest"]].apply(pd.to_numeric)
data
# Select data where nameDest is C1286084959 and present the median
data = pd.read_sql_query("SELECT amount, nameOrig, oldbalanceOrig, newbalanceOrig, oldbalanceDest, newbalanceDest FROM transaction_data WHERE nameDest='C1286084959'", con)
data[["amount","newbalanceOrig","newbalanceDest","oldbalanceOrig","oldbalanceDest"]] = data[["amount","newbalanceOrig","newbalanceDest","oldbalanceOrig","oldbalanceDest"]].apply(pd.to_numeric)
data["amount"].median()
Balances sender and receiver
data = pd.read_sql_query("SELECT amount, oldbalanceOrig, newbalanceOrig FROM transaction_data", con)
data[["amount","newbalanceOrig","oldbalanceOrig"]] = data[["amount","newbalanceOrig","oldbalanceOrig"]].apply(pd.to_numeric)
data["diffNewPlusAm"] = (data["newbalanceOrig"]+data["amount"]-data["oldbalanceOrig"])
data = data[data["diffNewPlusAm"] != 0.0000000000000].sort_values(by="diffNewPlusAm")
(data["diffNewPlusAm"]).hist()
plt.title("Differences in balance of sender")
plt.xlabel("Amount difference")
plt.ylabel("Frequency of differences")
data["diffNewPlusAm"].abs().sum()
data
data = pd.read_sql_query("SELECT amount, oldbalanceDest, newbalanceDest FROM transaction_data", con)
data[["amount","newbalanceDest","oldbalanceDest"]] = data[["amount","newbalanceDest","oldbalanceDest"]].apply(pd.to_numeric)
data["diffNewPlusAm"] = (data["newbalanceDest"]-data["amount"]-data["oldbalanceDest"])
data = data[data["diffNewPlusAm"] != 0.0000000000].sort_values(by="diffNewPlusAm")
(data["diffNewPlusAm"]).hist()
plt.title("Differences in balance of receiver")
plt.xlabel("Amount difference")
plt.ylabel("Frequency")
data["diffNewPlusAm"].abs().sum()
data