import pandas as pd
import numpy as np
# Small demo frame: two identical string columns in which 'a' appears twice
# and every other value once.
df_small = pd.DataFrame({'aa': ['a', 'a', 'b', 'c', 'd'], 'bb': ['a', 'a', 'b', 'c', 'd']})
df_small
# Frequency of each distinct value in column 'aa'. The Series method is used
# because the top-level ``pd.value_counts`` function is deprecated.
df_small['aa'].value_counts()
df_small['aa'].value_counts().plot(kind='bar')
# Element-wise counting: every element is counted on its own, so the result
# is a wide frame with one row per input element and one column per distinct
# value. This is far more expensive than a single ``value_counts`` call.
df_small['aa'].apply(lambda value: pd.Series([value]).value_counts())
import pandas as pd
import numpy as np
from itertools import cycle
def generate_fake_dataframe(size, cols, col_names = None, intervals = None, seed = None):
    """Build a DataFrame of random data, one column per type code in ``cols``.

    Parameters
    ----------
    size : int
        Number of rows to generate.
    cols : str or sequence of str
        One single-letter type code per column: ``"i"`` (integers),
        ``"f"`` (floats), ``"c"`` (categories) or ``"d"`` (dates),
        e.g. ``"ffic"``.
    col_names : list of str, optional
        Column labels; when a list, it must have one entry per column. When
        ``None`` the labels are ``column_<position>_<int|float|cat|date>``.
    intervals : list or dict, optional
        Sampling ranges.

        * ``None``: every column uses the default for its type code:
          ``"i"`` -> ``(0, 10)``, ``"f"`` -> ``(0, 100)``,
          ``"c"`` -> 5 values of a built-in family, ``"d"`` ->
          ``("2020-01-01", "2020-12-31")``.
        * ``dict``: maps type codes to replacement defaults.
        * ``list``: one entry per column; a ``None`` entry falls back to the
          default for that column's type code.

        An individual interval is a ``(start, end)`` tuple for ``"i"``
        (passed to ``Generator.integers``, so ``end`` is excluded), ``"f"``
        (passed to ``Generator.uniform``) and ``"d"`` (passed to
        ``pandas.date_range``). For ``"c"`` it is either a list of the
        values to sample from, or a ``(family, n)`` tuple where ``family``
        is one of ``"animals"``, ``"names"``, ``"cities"``, ``"colors"`` and
        ``n`` distinct values of that family are used.
    seed : optional
        Forwarded to ``numpy.random.default_rng`` for reproducible output.

    Returns
    -------
    pandas.DataFrame
        Frame with ``size`` rows and one column per entry of ``cols``.

    Raises
    ------
    AssertionError
        If ``col_names`` / ``intervals`` lists do not match ``cols`` in
        length, an ``intervals`` dict has unknown keys, an interval has the
        wrong shape, or a category family does not exist. Raised explicitly
        (not via ``assert``) so the checks also run under ``python -O``.
    KeyError
        If ``cols`` contains a type code other than ``i``, ``f``, ``c``, ``d``.
    """
    categories_dict = {
        'animals': ['cow', 'rabbit', 'duck', 'shrimp', 'pig', 'goat', 'crab', 'deer', 'bee', 'sheep', 'fish', 'turkey', 'dove', 'chicken', 'horse'],
        'names' : ['James', 'Mary', 'Robert', 'Patricia', 'John', 'Jennifer', 'Michael', 'Linda', 'William', 'Elizabeth', 'Ahmed', 'Barbara', 'Richard', 'Susan', 'Salomon', 'Juan Luis'],
        'cities' : ['Stockholm', 'Denver', 'Moscow', 'Marseille', 'Palermo', 'Tokyo', 'Lisbon', 'Oslo', 'Nairobi', 'Río de Janeiro', 'Berlin', 'Bogotá', 'Manila', 'Madrid', 'Milwaukee'],
        'colors' : ['red', 'orange', 'yellow', 'green', 'blue', 'indigo', 'purple', 'pink', 'silver', 'gold', 'beige', 'brown', 'grey', 'black', 'white'],
    }
    default_intervals = {"i" : (0,10), "f" : (0,100), "c" : ("names", 5), "d" : ("2020-01-01","2020-12-31")}
    rng = np.random.default_rng(seed)

    # Categorical columns that rely on the default interval rotate through
    # the available families, starting with the default one, so consecutive
    # default "c" columns do not all draw from the same family.
    first_c = default_intervals["c"][0]
    categories_names = cycle([first_c] + [c for c in categories_dict if c != first_c])
    default_intervals["c"] = (categories_names, default_intervals["c"][1])

    if isinstance(col_names, list):
        if len(col_names) != len(cols):
            raise AssertionError(f"The fake DataFrame should have {len(cols)} columns but col_names is a list with {len(col_names)} elements")
    elif col_names is None:
        suffix = {"c" : "cat", "i" : "int", "f" : "float", "d" : "date"}
        col_names = [f"column_{str(i)}_{suffix.get(col)}" for i, col in enumerate(cols)]

    # Reject unknown type codes up front. Without this check a bad code was
    # silently skipped (column missing from the result) whenever ``intervals``
    # was a list, while the default path failed with a bare KeyError.
    unknown_codes = [col for col in cols if col not in default_intervals]
    if unknown_codes:
        raise KeyError(f"Unknown column type code(s) {unknown_codes}; valid codes are {list(default_intervals)}")

    if isinstance(intervals, list):
        if len(intervals) != len(cols):
            raise AssertionError(f"The fake DataFrame should have {len(cols)} columns but intervals is a list with {len(intervals)} elements")
    else:
        if isinstance(intervals, dict):
            if set(intervals) - set(default_intervals):
                raise AssertionError("The intervals parameter has invalid keys")
            default_intervals.update(intervals)
        intervals = [default_intervals[col] for col in cols]

    df = pd.DataFrame()
    for col, col_name, interval in zip(cols, col_names, intervals):
        if interval is None:
            interval = default_intervals[col]
        interval_ok = isinstance(interval, list) or (isinstance(interval, tuple) and len(interval) == 2)
        if not interval_ok:
            raise AssertionError(f"This interval {interval} is neither a tuple of two elements nor a list of strings.")

        if col in ("i", "f", "d"):
            start, end = interval

        if col == "i":
            # ``end`` is exclusive here (Generator.integers default).
            df[col_name] = rng.integers(start, end, size)
        elif col == "f":
            df[col_name] = rng.uniform(start, end, size)
        elif col == "c":
            if isinstance(interval, list):
                # Explicit list of values to sample from.
                categories = np.array(interval)
            else:
                cat_family, length = interval
                if isinstance(cat_family, cycle):
                    # Default interval: take the next family in the rotation.
                    cat_family = next(cat_family)
                if cat_family not in categories_dict:
                    raise AssertionError(f"There are no samples for category '{cat_family}'. Consider passing a list of samples or use one of the available categories: {categories_dict.keys()}")
                # Pick ``length`` distinct values of the family to sample from.
                categories = rng.choice(categories_dict[cat_family], length, replace = False, shuffle = True)
            df[col_name] = rng.choice(categories, size, shuffle = True)
        elif col == "d":
            df[col_name] = rng.choice(pd.date_range(start, end), size)
    return df
# Large demo frame: 1.6 million rows with four float columns followed by one
# categorical column that draws on 15 distinct entries of the "names" family.
df = generate_fake_dataframe(
    size = 1600000,
    cols = "f" * 4 + "c",
    col_names = ["user", "age", "residence", "weight", "pet"],
    intervals = [(73.2,95.0)] * 2 + [(1.65,1.95), (73.2,95.0), ("names", 15)],
)
df
# Count each category once and reuse the result for both the table and the
# bar chart.
pet_counts = df['pet'].value_counts()
pet_counts
pet_counts.plot(kind='bar')
# NOTE: the element-wise apply below takes a long time to run because the DataFrame is large (1.6 million rows), so it is left commented out.
#df['pet'].apply(pd.value_counts).plot(kind='bar', )