import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the auction dataset and take a first look at its size and schema.
df = pd.read_csv('graphql_master.csv')
len(df)
df.columns
df.describe()

# Keep only works whose recorded area exceeds 1. The explicit .copy() makes
# w_area an independent frame, so the column assignments below no longer
# trigger pandas' SettingWithCopyWarning.
w_area = df.query('area > 1').copy()

# Natural-log transforms of area and hammer price.
# NOTE(review): np.log yields NaN for negative inputs; hammer_price is not
# filtered here, so such rows keep a NaN log_hammer.
w_area['log_area'] = np.log(w_area['area'])
w_area['log_hammer'] = np.log(w_area['hammer_price'])
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
"""Entry point for launching an IPython kernel.
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/arraylike.py:358: RuntimeWarning: invalid value encountered in log
result = getattr(ufunc, method)(*inputs, **kwargs)
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# The 100 artists with the most works, counted by non-null `medium` entries.
works_per_artist = w_area.groupby('name')['medium'].count()
top_names = works_per_artist.sort_values().iloc[-100:].index

# Restrict the data to works by those artists.
is_top_artist = w_area['name'].isin(top_names)
top_artist_area = w_area[is_top_artist]


def corr_func(group):
    """Return the correlation between log hammer price and log area."""
    return group['log_hammer'].corr(group['log_area'])


# Per-artist correlation, strongest positive relationship first.
top_artist_corr = (
    top_artist_area
    .groupby('name')
    .apply(corr_func)
    .sort_values(ascending=False)
)
top_artist_corr
def view_artist(df, artist_name):
    """Show a scatter plot of log hammer price against log area for one artist.

    Args:
        df: DataFrame with ``name``, ``log_area`` and ``log_hammer`` columns.
        artist_name: Value of the ``name`` column whose rows are plotted;
            also used as the plot title.
    """
    # Select with a boolean mask rather than interpolating the name into a
    # query string: a name containing a double quote would otherwise produce
    # an invalid query expression.
    artist_rows = df[df['name'] == artist_name]
    # Title the Axes returned by the plot call instead of relying on
    # pyplot's implicit "current axes".
    ax = artist_rows.plot.scatter('log_area', 'log_hammer')
    ax.set_title(f'{artist_name}')
    plt.show()
# Scatter plots for a hand-picked selection of the top-100 artists.
for selected_artist in (
    'Henry Moore',
    'André Kertész',
    'Diane Arbus',
    'Fernando Botero',
    'Camille Pissarro',
    'Thomas Ruff',
):
    view_artist(top_artist_area, selected_artist)
# Fit a quadratic of log hammer price against log area for Camille Pissarro.
# Only rows missing one of the two fitted columns are dropped; a bare
# dropna() would also discard works that merely lack some unrelated field.
data = (
    top_artist_area
    .query('name == "Camille Pissarro"')
    .dropna(subset=['log_area', 'log_hammer'])
)
x = data['log_area'].values
y = data['log_hammer'].values
# Degree-2 least-squares fit; coefficients come back highest power first.
np.polyfit(x, y, 2)
# Distribution of the per-artist correlations.
# Keep the Axes returned by seaborn in `ax`; chaining .set() onto the plotting
# call would bind `ax` to the return value of .set() instead of the Axes.
ax = sns.histplot(top_artist_corr)
ax.set(title='Histogram of Correlations')

# Number of artists per correlation band. The bins only span (0, 1], so
# artists with a non-positive correlation fall outside every band and are
# not counted.
# NOTE(review): the title says "Percent" but countplot draws raw counts.
corr_bands = pd.cut(top_artist_corr, bins=[0, 0.3, 0.6, 1])
ax = sns.countplot(x=corr_bands)
ax.set(title='Percent of Artists with Positive Effect by Correlation')
# Scatter of log hammer price against log area, coloured by medium, limited
# to the six media listed below.
f, ax = plt.subplots(figsize=(8, 8))
mediums = ['painting', 'drawing', 'print', 'sculpture', 'photograph', 'mixed media']
# x and y are passed by keyword: seaborn deprecated positional x/y, and from
# version 0.12 on only `data` may be given positionally.
g = sns.scatterplot(
    x='log_area',
    y='log_hammer',
    hue='medium',
    ax=ax,
    data=w_area[w_area.medium.isin(mediums)],
)
/shared-libs/python3.7/py/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
FutureWarning
# The 500 artists with the most works whose medium is "painting".
painting_counts = (
    w_area
    .query('medium == "painting"')
    .groupby('name')['medium']
    .count()
)
top_painter_names = painting_counts.sort_values().iloc[-500:].index

# NOTE(review): this keeps every work by those artists regardless of medium;
# confirm that non-painting works are meant to enter the correlation.
top_painter_area = w_area[w_area['name'].isin(top_painter_names)]

# Per-artist correlation of log hammer price with log area, highest first.
top_painter_corr = (
    top_painter_area
    .groupby('name')
    .apply(corr_func)
    .sort_values(ascending=False)
)
top_painter_corr

# Number of artists per positive-correlation band. Keep the Axes in `ax`;
# chaining .set() onto the plotting call would bind `ax` to the return value
# of .set() instead of the Axes.
ax = sns.countplot(x=pd.cut(top_painter_corr, bins=[0, 0.3, 0.6, 1]))
ax.set(title='Count of Works by Correlation')
# Scatter plots for two of the top painters.
view_artist(top_painter_area, 'Salvador Dalí')
view_artist(top_painter_area, 'Eliot Hodgkin')

# Fit a quadratic of log hammer price against log area for Eliot Hodgkin.
# Only rows missing one of the two fitted columns are dropped; a bare
# dropna() would also discard works that merely lack some unrelated field.
data = (
    top_painter_area
    .query('name == "Eliot Hodgkin"')
    .dropna(subset=['log_area', 'log_hammer'])
)
x = data['log_area'].values
y = data['log_hammer'].values
# Degree-2 least-squares fit; coefficients come back highest power first.
np.polyfit(x, y, 2)

view_artist(top_painter_area, 'Elizabeth Peyton')
# Work counts for the 100 artists with the most photographs (ascending order).
photo_rows = w_area.query('medium == "photograph"')
photo_counts = photo_rows.groupby('name')['medium'].count()
top_photo_names = photo_counts.sort_values().iloc[-100:]

# Photographs by those artists only.
by_top_photographer = photo_rows['name'].isin(top_photo_names.index)
top_photo_area = photo_rows[by_top_photographer]

# Per-artist correlation of log hammer price with log area, highest first.
top_photo_corr = (
    top_photo_area
    .groupby('name')
    .apply(lambda group: group['log_hammer'].corr(group['log_area']))
    .sort_values(ascending=False)
)
print(top_photo_names)
top_photo_corr
name
Rineke Dijkstra 64
Edward S. Curtis 64
Peter Lindbergh 64
Mario Giacomelli 64
Joel Sternfeld 65
...
Hiroshi Sugimoto 626
Irving Penn 671
Robert Mapplethorpe 715
Ansel Adams 799
Henri Cartier-Bresson 899
Name: medium, Length: 100, dtype: int64
# Scatter plots for selected photographers.
for photographer in (
    'Robert Adams',
    'Henri Cartier-Bresson',  # most prolific photographer in our data
    'Thomas Struth',
):
    view_artist(top_photo_area, photographer)
# Number of non-null hammer prices per medium, largest first.
prices_per_medium = df.groupby('medium')['hammer_price'].count()
prices_per_medium.sort_values(ascending=False)

# Positions 200-249 of the painter correlations (already sorted descending).
top_painter_corr.iloc[200:250]

# Summary statistics and histogram of the `living` column.
df['living'].describe()
df['living'].hist()