# Misc
import gc
import re
import textwrap
import pickle
from tqdm import tqdm
# Data
import scipy
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
pd.set_option('display.max_columns', 100)
# Visualización
import matplotlib.pyplot as plt
import seaborn as sns
import matplotx
palette = plt.rcParams['axes.prop_cycle'].by_key()['color']
# Machine Learning
from lightfm import LightFM
from lightfm.evaluation import (
precision_at_k,
recall_at_k,
auc_score
)
from category_encoders.binary import BinaryEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import (
preprocessing,
model_selection,
impute,
compose,
pipeline,
metrics,
decomposition
)
def clean_values(value, remove_zeros=True):
if pd.isna(value):
new_value = np.nan
else:
try:
new_value = np.int64(re.sub('[^0-9]', '', str(value))[:9])
except Exception:
new_value = np.nan
if remove_zeros:
return new_value if new_value > 0 else np.nan
else:
return np.nan
%%time
cols_dtypes = {
'ID del Proceso': 'category',
'Nit Entidad': 'category',
'Entidad': 'category',
'Departamento Entidad': 'category',
'Nombre del Procedimiento': object,
'Precio Base': np.float32,
'Modalidad de Contratacion': 'category',
'Estado de Apertura del Proceso': 'category',
'Adjudicado': 'category',
'Nombre del Proveedor Adjudicado': 'category',
'Codigo Principal de Categoria': object,
'NIT del Proveedor Adjudicado': object,
'Tipo de Contrato': 'category'
}
path = '/datasets/licitana/Data/secop_contratos.csv'
contracts = pd.read_csv(path,
usecols=list(cols_dtypes.keys()) + ['Fecha de Publicacion del Proceso'],
chunksize=500000,
na_values=['No Definido', -1, 'No Adjudicado'],
parse_dates=['Fecha de Publicacion del Proceso'],
infer_datetime_format=True,
low_memory=False,
dtype=cols_dtypes)
contracts_clean = []
for chunk in range(4):
contracts_chunk = contracts.get_chunk()
contracts_chunk.dropna(inplace=True, subset=["NIT del Proveedor Adjudicado"])
contracts_clean.append(contracts_chunk)
contracts = pd.concat(contracts_clean, axis=0)
del contracts_clean
contracts['Nit Entidad'] = contracts['Nit Entidad'].apply(clean_values)
contracts['NIT del Proveedor Adjudicado'] = contracts['NIT del Proveedor Adjudicado'].apply(clean_values)
contracts = contracts[
pd.to_numeric(
contracts['NIT del Proveedor Adjudicado'],
errors='coerce',
downcast='integer'
).notnull()
]
contracts['Nit Entidad'] = contracts['Nit Entidad'].astype(
np.int32, errors='raise')
contracts['NIT del Proveedor Adjudicado'] = contracts['NIT del Proveedor Adjudicado'].astype(
np.int32, errors='raise')
contracts.info()
%%time
cols_dtypes = {
'ID Procedimiento': object,
'NIT Entidad': object,
'NIT Proveedor': object,
}
path = '/work/data/01_raw/secop_proponentes.csv'
bidders = pd.read_csv(path,
usecols=list(cols_dtypes.keys()),
dtype=cols_dtypes,
low_memory=False)
bidders['NIT Entidad'] = bidders['NIT Entidad'].apply(clean_values)
bidders['NIT Proveedor'] = bidders['NIT Proveedor'].apply(clean_values)
bidders = bidders[
pd.to_numeric(
bidders['NIT Proveedor'],
errors='coerce',
downcast='integer'
).notnull()
]
bidders['NIT Entidad'] = bidders['NIT Entidad'].astype(np.int32, errors='raise')
bidders['NIT Proveedor'] = bidders['NIT Proveedor'].astype(np.int32, errors='raise')
bidders.info()
%%time
path = '/work/data/01_raw/codigos_unspsc.xlsx'
unspsc = pd.read_excel(path)
unspsc = unspsc.astype(str)
unspsc.info()
gc.collect()
top_10 = (
contracts.groupby('Nombre del Proveedor Adjudicado')
.count().sort_values(by='Entidad', ascending=False)['Entidad'].head(15)
)
with plt.style.context(matplotx.styles.duftify(matplotx.styles.dracula)):
plt.figure(figsize=(9, 7))
sns.barplot(y=top_10.index, x= top_10.values,
color=palette[0])
plt.xlabel('Número de contratos')
plt.ylabel('')
plt.title('Top 10 empresas con más contratos adjudicados', fontdict={'fontsize': 16})
sns.despine()
plt.show()
tipo_contrato = contracts['Tipo de Contrato'].value_counts()
with plt.style.context(matplotx.styles.duftify(matplotx.styles.dracula)):
plt.figure(figsize=(9, 7))
sns.barplot(y=tipo_contrato.index, x=tipo_contrato.values,
order=tipo_contrato.index, color=palette[1])
plt.xlabel('Número de contratos')
plt.ylabel('')
plt.title('Número de contratos por tipo', fontdict={'fontsize': 16})
sns.despine()
plt.legend([])
plt.grid(False)
plt.show()
contracts_year = (
contracts.groupby(contracts['Fecha de Publicacion del Proceso']
.map(lambda x: x.year))
.size()
).loc[:2021]
contracts['Año de Publicacion'] = contracts['Fecha de Publicacion del Proceso'].dt.year
tipos_year = (
contracts.groupby(['Año de Publicacion', 'Tipo de Contrato'])
.size().to_frame().reset_index()
)
tipos_year.columns = ['year', 'type', 'count']
tipos_year = tipos_year.query('year != 2022')
with plt.style.context(matplotx.styles.duftify(matplotx.styles.dracula)):
plt.figure(figsize=(9, 7))
sns.lineplot(x=contracts_year.index, y=contracts_year.values,
marker=None, label='Todos los contratos')
for tipo in list(contracts['Tipo de Contrato'].value_counts().head().index):
sns.lineplot(data=tipos_year.query("type == @tipo"),
x='year', y='count', hue='type',
label=tipo)
plt.title('Contratos por año por tipo', fontdict={'fontsize': 16})
matplotx.ylabel_top("No. de contratos")
matplotx.line_labels()
plt.xlabel('')
plt.grid(True, axis='y')
sns.despine(bottom=True, left=True)
plt.legend([])
plt.show()
unspsc_segmento = (
unspsc.groupby(["Código Segmento","Nombre Segmento"])
.size().to_frame().reset_index().drop(0,axis=1)
)
contracts["Código Segmento"] = contracts["Codigo Principal de Categoria"].str.slice(start=3, stop=5)
contracts = contracts.merge(unspsc_segmento, how='left', on="Código Segmento")
contracts.drop(columns=['Codigo Principal de Categoria',
'Código Segmento',
'Nit Entidad',
'Estado de Apertura del Proceso'],
inplace=True)
bidders_backup = bidders.copy()
contracts_backup = contracts.copy()
item_min_ratings = 20
filter_items = bidders_backup['ID Procedimiento'].value_counts() > item_min_ratings
filter_items = filter_items[filter_items].index.tolist()
print(len(filter_items))
user_min_ratings = 10
user_max_ratings = 15
filter_users = (
bidders_backup.loc[bidders_backup['ID Procedimiento'].isin(filter_items)]
['NIT Proveedor'].value_counts()
)
filter_users = (filter_users >= user_min_ratings) & (filter_users <= user_max_ratings)
filter_users = filter_users[filter_users].index.tolist()
print(len(filter_users))
filter_valid_mask = (
(bidders_backup['ID Procedimiento'].isin(filter_items)) &
(bidders_backup['NIT Proveedor'].isin(filter_users))
)
bidders = bidders_backup.loc[filter_valid_mask].copy()
filter_items = bidders['ID Procedimiento'].unique().tolist()
valid_items = contracts_backup['ID del Proceso'].isin(filter_items)
contracts = contracts_backup.loc[valid_items].copy()
bid_ids = list(set(filter_items))
bidders_ids = list(set(filter_users))
print(f"{len(bid_ids)} contracts", f"& {len(bidders_ids)} bidders")
contracts.rename(
columns={
'Entidad': 'entity',
'Nit Entidad': 'entity_id',
'Departamento Entidad': 'entity_department',
'ID del Proceso': 'bid_id',
'Nombre del Procedimiento': 'bid_title',
'Fecha de Publicacion del Proceso': 'published_at',
'Precio Base': 'value',
'Modalidad de Contratacion': 'bid_modality',
'Adjudicado': 'is_awarded',
'Nombre del Proveedor Adjudicado': 'contractor',
'NIT del Proveedor Adjudicado': 'contractor_id',
'Codigo Principal de Categoria': 'unspsc',
'Estado de Apertura del Proceso': 'bid_status',
'Tipo de Contrato': 'bid_type',
'Código Segmento': 'unspsc_segment_code',
'Nombre Segmento': 'unspsc_segment',
'Año de Publicacion': 'published_at_year',
},
inplace=True
)
bidders.rename(
columns={
'ID Procedimiento': 'bid_id',
'NIT Entidad': 'entity_id',
'NIT Proveedor': 'contractor_id',
},
inplace=True
)
%%time
def get_rating(row):
awarded_mask = np.where(
(contracts.bid_id == row['bid_id']) &
(contracts.contractor_id == row['contractor_id'])
)[0]
if len(awarded_mask) >= 1:
row['rating'] = 2
else:
row['rating'] = 1
return row
bidders = bidders.apply(get_rating, axis=1)
if True:
contracts.to_csv('/work/data/02_intermediate/contracts.csv', index=False)
bidders.to_csv('/work/data/02_intermediate/bidders.csv', index=False)
def create_rating_map(interactions: pd.DataFrame, ratings: pd.Series) -> dict:
"""Create dictionary of user-items ratings."""
rating_map = {}
for i in range(len(interactions)):
contractor_id = interactions.iloc[i]['contractor_id']
bid_id = interactions.iloc[i]['bid_id']
if contractor_id not in rating_map:
rating_map[contractor_id] = {}
if bid_id not in rating_map[contractor_id]:
rating_map[contractor_id][bid_id] = ratings.iloc[i]
else:
continue
return rating_map
def create_rating_matrix(ratings_map: dict, template_matrix: pd.DataFrame) -> pd.DataFrame:
"""Create two matrix of ratings and probabilities of user-items interactions."""
rating_matrix = template_matrix.copy()
proba_matrix = template_matrix.copy()
for bidder_id in bidders_ids:
for bid_id in bid_ids:
if bidder_id in ratings_map:
if bid_id in ratings_map[bidder_id]:
rating = ratings_map[bidder_id][bid_id]
rating_matrix.loc[bidder_id, bid_id] = rating
proba_matrix.loc[bidder_id, bid_id] = 1
return rating_matrix, proba_matrix
template_array = np.zeros(shape=(len(bidders_ids), len(bid_ids)),
dtype=np.int8)
template_matrix = pd.DataFrame(data=template_array,
columns=bid_ids,
index=bidders_ids,
dtype=np.int8)
print(template_matrix.shape)
%%time
X = bidders[['contractor_id', 'bid_id']].copy()
y = bidders['rating']
rating_map = create_rating_map(X, y)
ratings_matrix, probas_matrix = create_rating_matrix(rating_map, template_matrix)
if True:
ratings_matrix.to_csv('/work/data/03_processed/ratings_all.csv', index=True)
probas_matrix.to_csv('/work/data/03_processed/probas_all.csv', index=True)
%%time
kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)
train_indexs_list = []
test_indexs_list = []
train_ratings_list = []
train_probas_list = []
test_ratings_list = []
test_list = []
for fold, (train_index, test_index) in enumerate(kfold.split(X=X, y=y), 1):
print(f'Starting fold {fold}')
train_indexs_list.append(train_index)
test_indexs_list.append(test_index)
X_train = bidders.iloc[train_index][['contractor_id', 'bid_id']].copy()
y_train = bidders.iloc[train_index]['rating']
X_test = bidders.iloc[test_index][['contractor_id', 'bid_id']].copy()
y_test = bidders.iloc[test_index]['rating']
test = bidders.iloc[test_index][['contractor_id', 'bid_id', 'rating']].copy()
print('\tMapping...')
rating_map_train = create_rating_map(X_train, y_train)
rating_map_test = create_rating_map(X_test, y_test)
print('\tFilling matrix...')
rating_matrix_train, proba_matrix_train = create_rating_matrix(
rating_map_train, template_matrix)
rating_matrix_test, proba_matrix_test = create_rating_matrix(
rating_map_test, template_matrix)
print('\tShape:', proba_matrix_train.shape)
train_ratings_list.append(rating_matrix_train)
train_probas_list.append(proba_matrix_train)
test_ratings_list.append(rating_matrix_test)
test_list.append(test)
if True:
with open('/work/data/03_processed/train_indexs_list.pickle', 'wb') as f:
pickle.dump(train_indexs_list, f)
with open('/work/data/03_processed/test_indexs_list.pickle', 'wb') as f:
pickle.dump(test_indexs_list, f)
with open('/work/data/03_processed/train_ratings_list.pickle', 'wb') as f:
pickle.dump(train_ratings_list, f)
with open('/work/data/03_processed/train_probas_list.pickle', 'wb') as f:
pickle.dump(train_probas_list, f)
with open('/work/data/03_processed/test_ratings_list.pickle', 'wb') as f:
pickle.dump(test_ratings_list, f)
with open('/work/data/03_processed/test_list.pickle', 'wb') as f:
pickle.dump(test_list, f)
bidders = pd.read_csv('/work/data/02_intermediate/bidders.csv')
contracts = pd.read_csv('/work/data/02_intermediate/contracts.csv')
ratings_matrix = pd.read_csv('/work/data/03_processed/ratings_all.csv', index_col=0)
probas_matrix = pd.read_csv('/work/data/03_processed/probas_all.csv', index_col=0)
with open('/work/data/03_processed/train_indexs_list.pickle', 'rb') as f:
train_indexs_list = pickle.load(f)
with open('/work/data/03_processed/test_indexs_list.pickle', 'rb') as f:
test_indexs_list = pickle.load(f)
with open('/work/data/03_processed/train_ratings_list.pickle', 'rb') as f:
train_ratings_list = pickle.load(f)
with open('/work/data/03_processed/train_probas_list.pickle', 'rb') as f:
train_probas_list = pickle.load(f)
with open('/work/data/03_processed/test_ratings_list.pickle', 'rb') as f:
test_ratings_list = pickle.load(f)
with open('/work/data/03_processed/test_list.pickle', 'rb') as f:
test_list = pickle.load(f)
%%time
cols_dtypes = {
'ID del Proceso': 'category',
'Nombre del Procedimiento': object
}
all_contracts = pd.read_csv('/datasets/licitana/Data/secop_contratos.csv',
usecols=list(cols_dtypes.keys()),
na_values=['No Definido', -1, 'No Adjudicado'],
low_memory=False,
dtype=cols_dtypes)
all_contracts.columns = ['bid_id', 'bid_title']
all_contracts.bid_id = all_contracts.bid_id.str.replace('CO1.REQ.', '', regex=False).astype(np.int64)
bidder_ids = list(ratings_matrix.index)
bid_ids = list(ratings_matrix.columns)
bid_ids = [int(id.replace('CO1.REQ.', '')) for id in bid_ids]
ratings_matrix.columns = bid_ids
probas_matrix.columns = bid_ids
for df_list in [train_ratings_list, train_probas_list, test_ratings_list]:
for df in df_list:
df.columns = bid_ids
for df in [bidders, contracts] + test_list:
df.bid_id = df.bid_id.str.replace('CO1.REQ.', '', regex=False).astype(np.int64)
bidders = bidders[['contractor_id', 'bid_id', 'rating']]
%%time
precision_test_means = []
recall_test_means = []
auc_test_means = []
factors = [4, 8, 16, 32, 64, 128, 256, 512]
for factor in factors:
print(f'Número de Factores: {factor}')
model = LightFM(loss='warp',
random_state=42,
no_components=factor,
learning_rate=0.05,
user_alpha=0.003,
item_alpha=0.003)
cv_metrics = {
'precision': [],
'recall': [],
'auc': []
}
for fold in tqdm(range(len(train_ratings_list))):
train = csr_matrix(train_ratings_list[fold].values)
test = csr_matrix(test_ratings_list[fold].values)
model.fit(train, epochs=50, verbose=False)
cv_metrics['precision'] = precision_at_k(model, test, train, k=10, check_intersections=False).mean()
cv_metrics['recall'] = recall_at_k(model, test, train, k=10, check_intersections=False).mean()
cv_metrics['auc'] = auc_score(model, test, train, check_intersections=False).mean()
print(' Recall: {:.3f}'.format(np.mean(cv_metrics['recall'])))
print(' Precision: {:.3f}'.format(np.mean(cv_metrics['precision'])))
print(' AUC: {:.3f}'.format(np.mean(cv_metrics['auc'])))
precision_test_means.append(np.mean(cv_metrics['precision']))
recall_test_means.append(np.mean(cv_metrics['recall']))
auc_test_means.append(np.mean(cv_metrics['auc']))
with plt.style.context(matplotx.styles.duftify(matplotx.styles.dracula)):
plt.figure(figsize=(8, 4))
sns.scatterplot(x=factors, y=recall_test_means)
sns.scatterplot(x=[factors[np.argmax(recall_test_means)]],
y=[np.max(recall_test_means)], label='Best one')
plt.text(factors[np.argmax(recall_test_means)] + 7,
np.max(recall_test_means) + 0.002,
f'Mejor factor: {factors[np.argmax(recall_test_means)]}',
fontsize=11)
plt.xlabel('Numero de factores')
plt.title('Recall@k por número de factores')
plt.xlim(-5, 550)
plt.ylim(0.05, 0.35)
matplotx.ylabel_top("Recall@k")
matplotx.line_labels()
sns.despine(bottom=True, left=True)
plt.grid(True, axis='y')
plt.legend([])
plt.show()
no_factors = factors[np.argmax(recall_test_means)]
model = LightFM(loss='warp',
random_state=42,
no_components=no_factors,
learning_rate=0.05,
user_alpha=0.003,
item_alpha=0.003)
data = csr_matrix(ratings_matrix)
model.fit(data, epochs=100, verbose=True)
bidder_ids_mapping = {k: v for k, v in zip(bidder_ids, range(len(bidder_ids)))}
bid_ids_mapping = {k: v for k, v in zip(bid_ids, range(len(bid_ids)))}
def get_bidder_name(bidder_id):
try:
bidder_name = (
contracts.loc[contracts.contractor_id == bidder_id]
.iloc[0].contractor
)
except Exception:
bidder_name = 'Error getting contractor name'
return bidder_name
def get_bid_title(bid_id):
try:
bid_title = (
all_contracts.loc[all_contracts.bid_id == bid_id]
.iloc[0].bid_title
)
except Exception:
bid_title = 'Error getting contract title'
return bid_title
def get_sample_recommendation(model, bidders_ids):
n_bidders, n_bids = ratings_matrix.shape
for bidder_id, bidder_name in bidders_ids:
bidder_ratings = ratings_matrix.loc[bidder_id]
known_positives = list(bidder_ratings.loc[bidder_ratings >= 1].index)
bidder_iid = bidder_ids_mapping[bidder_id]
scores = model.predict(bidder_iid, np.arange(n_bids))
top_items = np.array(bid_ids)[np.argsort(-scores)]
print("\nProveedor: {}".format(bidder_name))
print(" Contratos aplicados:")
for bid_id in known_positives[:5]:
print("\t{} --> {}".format(bid_id, get_bid_title(bid_id)))
print(" Recomendación:")
top_items = [item for item in top_items if item not in known_positives]
for bid_id in top_items[:5]:
print("\t{} --> {}".format(bid_id, get_bid_title(bid_id)))
bidders_segment = pd.merge(
left=bidders, right=contracts[['bid_id', 'unspsc_segment']].drop_duplicates(),
on='bid_id', how='left'
)
largest_contractor_per_segment = (
bidders_segment.groupby(['unspsc_segment', 'contractor_id']).size()
.groupby('unspsc_segment', group_keys=False).nlargest(1).to_frame()
.reset_index().sort_values(by=0, ascending=False).head(7)
)
largest_contractor_per_segment
sample_bidders_ids = [
(900666861, 'PROYECTOS Y TECNOLOGIAS S.A.S'),
(800210669, 'TRANSPORTES GALAXIA S. A.'),
(900471414, 'ORIGIN IT SAS')
]
get_sample_recommendation(model=model, bidders_ids=sample_bidders_ids)