import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
def above_below(x):
if x['hammer_price_bp'] < x['estimate_low']:
return 'below_low'
if x['hammer_price_bp'] > x['estimate_high']:
return 'above_high'
if x['hammer_price_bp'] >= x['estimate_low']:
return 'above_low'
def to_guarantor(x):
if (x['guarantee_status'] != 'no guarantee') & (x['estimate_comp'] == 'below_low'):
return 1
else:
return 0
def sold(x):
if (x['hammer_price_bp'] == -1) & (x['guarantee_status'] == 'no guarantee'):
return 0
if (x['hammer_price_bp'] != -1) & (x['guarantee_status'] == 'no guarantee'):
return 1
if x['guarantee_status'] != 'no guarantee':
return 1
Number of unique lots: 34534
Number of unique auctions: 468
Number of works missing low estimate: 258
guar = df.query('guarantee_status != "no guarantee" ')
pct_guar = guar.groupby('guarantee_status')['to_guarantor'].mean()
pct_guar.plot.bar()
plt.ylabel('% guarantor-acquired')
plt.show()
fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True,figsize = (15,5))
sns.barplot(x='guarantee_status', y='to_guarantor', hue='year', data=guar, ax=ax1).set(ylabel="% guarantor acquired")
sns.barplot(x='guarantee_status', y='to_guarantor', hue='name', data=guar, ax=ax2).set(ylabel="% guarantor acquired")
plt.show()
g = sns.FacetGrid(guar.query('(over_low > 0)&(over_low < 5)'), col="guarantee_status", height=6)
ax = g.map(sns.histplot, 'over_low')
sold_to_non_guar = guar.query('to_guarantor==0')
g = sns.FacetGrid(sold_to_non_guar.query('(over_low > 0)&(over_low < 5)'), col="guarantee_status", height=6)
ax = g.map(sns.histplot, 'over_low')
# How many samples have no low estimate and were unsold (resulting in over_low = 1)?
len(df.query('(estimate_low == -1) & (hammer_price_bp == -1)'))
# How many of these are guaranteed?
len(df.query('(estimate_low == -1) & (hammer_price_bp == -1) & (guarantee_status != "no guarantee")'))
str_by_guar_status = df.groupby(['guarantee_status'])['sold'].mean()
pd.DataFrame(str_by_guar_status.values, index=str_by_guar_status.index, columns=['STR'])
ax = sns.barplot(x='guarantee_status', y='sold', hue='year', data=df)
by_gstatus = df.groupby('guarantee_status').count()
by_gstatus['name']
by_gstatus['name'].plot.pie(autopct="%.1f%%", explode=[0.4]*3)
plt.ylabel('guarantee_status')
plt.show()
by_year = df.query('guarantee_status != "no guarantee" ').groupby('year')['guarantee_status'].count()
by_year.plot.pie(autopct="%.1f%%")
plt.show()
ax = sns.countplot(x="name", hue="year", data=df).set(title='Sales by Auction House and Year')
ax = sns.countplot(x="name", hue="guarantee_status", data=df).set(title='Guarantee Status by Auction House')
ax = sns.countplot(x="year", hue="guarantee_status", data=df).set(title='Guarantee Status by Year')
by_year_auc_ct = df.groupby('year')['auction_id'].nunique()
by_year_auc_ct.plot.bar()
(df.groupby('auction_id')['sold'].mean() == 0).sum()
zero_str = (df.groupby('auction_id')['sold'].mean() == 0).sort_values(ascending=False).iloc[:159].index
zero_str_df = df[df.auction_id.isin(zero_str)]
len(zero_str_df.query('year < 2021').auction_id.unique())
# True # of auctions without link (manual inputs with unique ids)
len(zero_str_df[zero_str_df.link == 'unknown'].start_date.unique())
df.query('auction_id == "ee5e867d-1dac-5e3c-b9d2-94cd368802e5"')
zero_str_df[zero_str_df.link == 'unknown']
pd.Series(zero_str_df.link.unique()).to_csv('zero_str.csv')
df[df.auction_id == '28418804-a6a8-50c4-9e18-3a6396068413'].link.iloc[0]
ax = sns.barplot(x='year', y='sold', hue='name', data=df).set(title="STR by Auction House and Year")
ax = sns.countplot(x='name',data=guar).set(title='Count of Guaranteed by Auction House')
# % guaranteed selling above the low & high
print('above high: ', df.query('(guarantee_status != "no guarantee")').above_high.mean())
print('above low: ', df.query('(guarantee_status != "no guarantee")').above_low.mean())
above high: 0.3615127919911012
above low: 0.681868743047831
def right_tail_anal(df):
print('# without low estimate: ', (df.estimate_low == -1).sum())
df = df.query('estimate_low != -1')
print('Total # of works: ', len(df))
df = df.query('above_high == 1')
print('Right Tail Median: ', df.over_low.median())
print('Right Tail SD: ', df.over_low.std())
return df
one_five = df.query('(hammer_price_bp > 1000000) & (hammer_price_bp < 5000000)')
under_one = df.query('(hammer_price_bp < 1000000) & (hammer_price_bp > 0) & (over_low < 100) & (estimate_low > 500)')
over_five = df.query('(hammer_price_bp > 5000000)')
ax = sns.histplot(x='over_low', data=right_tail_anal(one_five))
# without low estimate: 2
Total # of works: 1515
Right Tail Median: 2.317857142857143
Right Tail SD: 2.7318802706566485
ax = sns.histplot(x='over_low', data=right_tail_anal(under_one).query('over_low < 30'))
# without low estimate: 0
Total # of works: 24409
Right Tail Median: 2.2916666666666665
Right Tail SD: 2.9369637953734
ax = sns.histplot(x='over_low', data=right_tail_anal(over_five))
# without low estimate: 9
Total # of works: 517
Right Tail Median: 2.02875
Right Tail SD: 2.697156904192016