import pandas as pd
import numpy as np
import geopandas as gpd
# Load the SII address CSV into a DataFrame, then show it and print its
# column/dtype summary.
# NOTE(review): "Cqbo" in the file name presumably stands for Coquimbo -- confirm.
coq_ssi = pd.read_csv(
    filepath_or_buffer='../SII_data/SII_data/Cqbo_SII.csv',
)
coq_ssi
coq_ssi.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95306 entries, 0 to 95305
Data columns (total 17 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Unnamed: 0 95306 non-null int64
1 Rut 95306 non-null float64
2 Dv 95306 non-null object
3 Vigencia 95306 non-null object
4 Fecha 95306 non-null object
5 Tipo Direccion 95306 non-null object
6 Calle 95306 non-null object
7 Numero 88241 non-null object
8 Bloque 516 non-null object
9 Departamento 17332 non-null object
10 Villa Poblacion 53498 non-null object
11 Ciudad 78927 non-null object
12 Comuna 95306 non-null object
13 Region 95306 non-null object
14 full_dir 95306 non-null object
15 lat 61565 non-null float64
16 long 61565 non-null float64
dtypes: float64(3), int64(1), object(13)
memory usage: 12.4+ MB
# Load the 2005-2009 SII legal-entity file (tab-separated, Latin-1 encoded),
# then show it and print its column/dtype summary.
ssi_2005_2009 = pd.read_csv(
    filepath_or_buffer='../SII_data/SII_data/PUB_EMPRESAS_PJ_2005a2009.txt',
    sep='\t',
    encoding='latin1',
)
ssi_2005_2009
ssi_2005_2009.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1270338 entries, 0 to 1270337
Data columns (total 20 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Año comercial 1270338 non-null int64
1 RUT 1270338 non-null int64
2 DV 1270338 non-null object
3 Razón social 1270311 non-null object
4 Tramo según ventas 1270338 non-null int64
5 Número de trabajadores dependientes informados 1270338 non-null int64
6 Fecha inicio de actividades vigente 909282 non-null object
7 Fecha término de giro 362255 non-null object
8 Fecha primera inscripción de actividades 1268914 non-null object
9 Tipo término de giro 362255 non-null object
10 Tipo de contribuyente 1270338 non-null object
11 Subtipo de contribuyente 1270338 non-null object
12 Tramo capital propio positivo 836665 non-null float64
13 Tramo capital propio negativo 137183 non-null float64
14 Rubro económico 1270338 non-null object
15 Subrubro económico 1270338 non-null object
16 Actividad económica 1270338 non-null object
17 Región 1270338 non-null object
18 Provincia 1270338 non-null object
19 Comuna 1270338 non-null object
dtypes: float64(2), int64(4), object(14)
memory usage: 193.8+ MB
# Load the 2010-2014 SII legal-entity file (tab-separated, Latin-1 encoded),
# then show it and print its column/dtype summary.
# NOTE(review): the variable name says "1014" while the file covers 2010a2014;
# looks like a typo, but the name is kept because other cells may reference it.
ssi_2010_1014 = pd.read_csv(
    filepath_or_buffer='../SII_data/SII_data/PUB_EMPRESAS_PJ_2010a2014_Chile_SSI.txt',
    sep='\t',
    encoding='latin1',
)
ssi_2010_1014
ssi_2010_1014.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1723355 entries, 0 to 1723354
Data columns (total 20 columns):
# Column Dtype
--- ------ -----
0 Año comercial int64
1 RUT int64
2 DV object
3 Razón social object
4 Tramo según ventas int64
5 Número de trabajadores dependientes informados int64
6 Fecha inicio de actividades vigente object
7 Fecha término de giro object
8 Fecha primera inscripción de actividades object
9 Tipo término de giro object
10 Tipo de contribuyente object
11 Subtipo de contribuyente object
12 Tramo capital propio positivo float64
13 Tramo capital propio negativo float64
14 Rubro económico object
15 Subrubro económico object
16 Actividad económica object
17 Región object
18 Provincia object
19 Comuna object
dtypes: float64(2), int64(4), object(14)
memory usage: 263.0+ MB
# Load the 2015-2019 SII legal-entity file (tab-separated, Latin-1 encoded).
# Rows whose field count does not match the expected width are skipped, and
# each skipped line is still reported.
#
# `on_bad_lines='warn'` replaces `error_bad_lines=False`: the old argument is
# deprecated since pandas 1.3 and removed in pandas 2.0 (where passing it raises
# a TypeError). 'warn' preserves the previous skip-and-report behaviour.
#
# NOTE(review): in the recorded run this call produced a frame with a 10-level
# MultiIndex and data values as column names, which suggests the first line of
# the file is not the 20-column header found in the 2005-2009 / 2010-2014
# files. Confirm against the raw file; if so, pass `header=None` and explicit
# `names=` here. The DtypeWarning for columns (1, 4, 5, 12, 13) likewise points
# to supplying explicit `dtype=` once the layout is confirmed.
sii_2015_2019 = pd.read_csv(
    '../SII_data/SII_data/PUB_EMPRESAS_PJ_2015a2019_Chile_SSI.txt',
    sep='\t',
    encoding='latin1',
    on_bad_lines='warn',
)
/var/folders/xg/r0j44_1n5cn5hs8r05xrt57h0000gn/T/ipykernel_10238/2553695165.py:1: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.
sii_2015_2019 = pd.read_csv('../SII_data/SII_data/PUB_EMPRESAS_PJ_2015a2019_Chile_SSI.txt', error_bad_lines=False, encoding='latin1', sep='\t')
b'Skipping line 49: expected 20 fields, saw 29\nSkipping line 10393: expected 20 fields, saw 35\nSkipping line 24484: expected 20 fields, saw 24\n'
b'Skipping line 36159: expected 20 fields, saw 21\n'
b'Skipping line 71221: expected 20 fields, saw 28\nSkipping line 85461: expected 20 fields, saw 33\nSkipping line 97065: expected 20 fields, saw 22\n'
b'Skipping line 134893: expected 20 fields, saw 21\n'
b'Skipping line 359065: expected 20 fields, saw 30\n'
b'Skipping line 397663: expected 20 fields, saw 28\n'
b'Skipping line 514017: expected 20 fields, saw 34\n'
b'Skipping line 559530: expected 20 fields, saw 24\nSkipping line 585105: expected 20 fields, saw 21\n'
b'Skipping line 601445: expected 20 fields, saw 21\nSkipping line 605166: expected 20 fields, saw 23\nSkipping line 611419: expected 20 fields, saw 31\nSkipping line 620563: expected 20 fields, saw 25\n'
b'Skipping line 627172: expected 20 fields, saw 25\nSkipping line 630587: expected 20 fields, saw 23\nSkipping line 634002: expected 20 fields, saw 35\nSkipping line 654297: expected 20 fields, saw 22\n'
b'Skipping line 672702: expected 20 fields, saw 31\n'
b'Skipping line 902822: expected 20 fields, saw 23\n'
b'Skipping line 1019957: expected 20 fields, saw 32\n'
b'Skipping line 1158958: expected 20 fields, saw 24\n'
b'Skipping line 1234280: expected 20 fields, saw 21\nSkipping line 1237337: expected 20 fields, saw 32\n'
b'Skipping line 1280442: expected 20 fields, saw 34\n'
b'Skipping line 1338503: expected 20 fields, saw 21\nSkipping line 1343059: expected 20 fields, saw 27\n'
b'Skipping line 1354281: expected 20 fields, saw 33\nSkipping line 1359888: expected 20 fields, saw 21\nSkipping line 1360740: expected 20 fields, saw 26\nSkipping line 1370196: expected 20 fields, saw 22\n'
b'Skipping line 1376367: expected 20 fields, saw 27\nSkipping line 1376712: expected 20 fields, saw 21\nSkipping line 1379743: expected 20 fields, saw 24\nSkipping line 1395272: expected 20 fields, saw 22\nSkipping line 1401077: expected 20 fields, saw 21\n'
b'Skipping line 1409739: expected 20 fields, saw 34\nSkipping line 1430782: expected 20 fields, saw 22\nSkipping line 1430966: expected 20 fields, saw 28\n'
b'Skipping line 1453456: expected 20 fields, saw 33\nSkipping line 1459044: expected 20 fields, saw 22\nSkipping line 1471455: expected 20 fields, saw 23\n'
b'Skipping line 1484911: expected 20 fields, saw 21\nSkipping line 1486181: expected 20 fields, saw 36\n'
b'Skipping line 1511636: expected 20 fields, saw 21\nSkipping line 1514568: expected 20 fields, saw 33\nSkipping line 1525224: expected 20 fields, saw 27\nSkipping line 1529302: expected 20 fields, saw 21\nSkipping line 1539966: expected 20 fields, saw 35\n'
b'Skipping line 1544338: expected 20 fields, saw 25\nSkipping line 1565133: expected 20 fields, saw 24\n'
b'Skipping line 1576750: expected 20 fields, saw 24\nSkipping line 1587115: expected 20 fields, saw 23\nSkipping line 1593732: expected 20 fields, saw 26\nSkipping line 1599616: expected 20 fields, saw 23\n'
b'Skipping line 1613353: expected 20 fields, saw 23\nSkipping line 1616040: expected 20 fields, saw 23\nSkipping line 1635196: expected 20 fields, saw 35\n'
b'Skipping line 1651311: expected 20 fields, saw 31\nSkipping line 1653171: expected 20 fields, saw 25\nSkipping line 1653261: expected 20 fields, saw 24\nSkipping line 1668061: expected 20 fields, saw 27\nSkipping line 1671188: expected 20 fields, saw 22\n'
b'Skipping line 1676655: expected 20 fields, saw 24\nSkipping line 1693812: expected 20 fields, saw 21\n'
b'Skipping line 1714199: expected 20 fields, saw 22\nSkipping line 1720035: expected 20 fields, saw 33\n'
b'Skipping line 1737549: expected 20 fields, saw 21\nSkipping line 1744945: expected 20 fields, saw 24\nSkipping line 1768566: expected 20 fields, saw 24\n'
b'Skipping line 1826734: expected 20 fields, saw 22\nSkipping line 1830801: expected 20 fields, saw 27\n'
b'Skipping line 1839110: expected 20 fields, saw 32\nSkipping line 1853946: expected 20 fields, saw 21\n'
b'Skipping line 1869968: expected 20 fields, saw 22\n'
b'Skipping line 1909738: expected 20 fields, saw 27\n'
/var/folders/xg/r0j44_1n5cn5hs8r05xrt57h0000gn/T/ipykernel_10238/2553695165.py:1: DtypeWarning: Columns (1,4,5,12,13) have mixed types. Specify dtype option on import or set low_memory=False.
sii_2015_2019 = pd.read_csv('../SII_data/SII_data/PUB_EMPRESAS_PJ_2015a2019_Chile_SSI.txt', error_bad_lines=False, encoding='latin1', sep='\t')
# Show the parsed 2015-2019 frame and print its index/column/dtype summary.
sii_2015_2019
sii_2015_2019.info()
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2560684 entries, (2015, 53147890, '8', 'SUC REINALDO FLORES MONDACA', 1, 0, nan, '2018-10-17', '1993-01-01 00:00:00', 'TERMINO DE GIRO PERSONA JURIDICA') to (2019, 99599860, '2', 'ORAND SA', 6, 13, '2005-07-01', nan, '2005-07-01 00:00:00', nan)
Data columns (total 10 columns):
# Column Dtype
--- ------ -----
0 Unnamed: 0 object
1 SUCESIONES O COMUNIDADES HERED object
2 4 object
3 Unnamed: 3 object
4 COMERCIO AL POR MAYOR Y AL POR MENOR; REPARACION DE VEHICULOS AUTOMOTORES Y MOTOCICLETAS object
5 VENTA AL POR MENOR DE ALIMENTOS, BEBIDAS Y TABACO EN COMERCIOS ESPECIALIZADOS object
6 VENTA AL POR MENOR DE BEBIDAS ALCOHOLICAS Y NO ALCOHOLICAS EN COMERCIOS ESPECIALIZADOS (BOTILLERIAS) object
7 XIII REGION METROPOLITANA object
8 Santiago object
9 SAN JOAQUIN object
dtypes: object(10)
memory usage: 332.8+ MB
# Read the "Directorio Nacional de Empresas 2017" shapefile into a
# GeoDataFrame, then show it and print its column/dtype summary.
rne = gpd.read_file(
    '../Archivos_Finales_Paper/geodataframes/Directorio_Nacional_de_Empresas_2017/Directorio_Nacional_de_Empresas_2017.shp',
)
rne
rne.info()
<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1001366 entries, 0 to 1001365
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 OBJECTID 1001366 non-null int64
1 REGION 1001366 non-null object
2 NOM_REGION 1001366 non-null object
3 COMUNA 1001366 non-null object
4 NOM_COMUNA 1001366 non-null object
5 SECCION_CI 1001366 non-null object
6 GLOSA_SECC 1001366 non-null object
7 DIVISION_C 1001366 non-null object
8 GLOSA_DIVI 1001366 non-null object
9 geometry 1001366 non-null geometry
dtypes: geometry(1), int64(1), object(8)
memory usage: 76.4+ MB
# Check the coordinate reference system (CRS) of the loaded file.
rne.crs
# Read the comunas shapefile into a GeoDataFrame, then show it and print its
# column/dtype summary.
comunas_chile = gpd.read_file(
    '../SHP_CHILE/Comunas/comunas.shp',
)
comunas_chile
comunas_chile.info()
<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 346 entries, 0 to 345
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 objectid 346 non-null int64
1 shape_leng 346 non-null float64
2 dis_elec 346 non-null int64
3 cir_sena 346 non-null int64
4 cod_comuna 346 non-null int64
5 codregion 346 non-null int64
6 st_area_sh 346 non-null float64
7 st_length_ 346 non-null float64
8 Region 346 non-null object
9 Comuna 346 non-null object
10 Provincia 346 non-null object
11 geometry 346 non-null geometry
dtypes: float64(3), geometry(1), int64(5), object(3)
memory usage: 32.6+ KB
# Check the coordinate reference system (CRS) of the loaded file.
comunas_chile.crs