import pandas as pd
# Read every per-notebook table first, then report the row count of each one
# in the same order in which the files are listed.
consistency_df = pd.read_csv('ntb_2020_consistency.csv')
mooc_df = pd.read_csv('ntb_2020_from_mooc.csv')
md_stats_df = pd.read_csv('ntb_2020_md_stats.csv')
text_df = pd.read_csv('ntb_2020_text_counts.csv')
versions_df = pd.read_csv('ntb_2020_versions.csv')
# The only JSON input; the file name suggests it holds import data from 2019.
old_data = pd.read_json('2019_imports_4128764_nbs.json')

for loaded in (consistency_df, mooc_df, md_stats_df, text_df, versions_df, old_data):
    print(len(loaded))
9941038
9941038
9941038
9941038
9941038
737
# Preview the first rows of every table loaded above.
# NOTE(review): a notebook renders only the last bare expression of a cell, so
# these presumably lived in separate cells — confirm before merging them.
consistency_df.head()
mooc_df.head()
md_stats_df.head()
text_df.head()
versions_df.head()
old_data.head()
# Load the per-notebook import lists, then discard the 'Unnamed: 0' column
# (presumably a row index written by an earlier to_csv call — confirm).
imports_df = pd.read_csv('ntb_2020_imports.csv')
imports_df = imports_df.drop(columns=['Unnamed: 0'])
print(len(imports_df))
imports_df.head()
9941038
# Download ntbs_list.json into the working directory.
# curl flags: -g turns off URL globbing, -O keeps the remote file name,
# -L follows redirects.
!curl -gOL "https://github-notebooks-samples.s3-eu-west-1.amazonaws.com/ntbs_list.json"
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
100 511M 100 511M 0 0 21.9M 0 0:00:23 0:00:23 --:--:-- 22.8M
notebook_prefix = 'https://github-notebooks-update1.s3-eu-west-1.amazonaws.com/'
example_ntb_link = f'{notebook_prefix}0000036466ae1fe8f89eada0a7e55faa1773e7ed.ipynb'
!curl -gOL https://github-notebooks-update1.s3-eu-west-1.amazonaws.com/0000036466ae1fe8f89eada0a7e55faa1773e7ed.ipynb
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
100 9283 100 9283 0 0 24820 0 --:--:-- --:--:-- --:--:-- 24754
# Load the notebook list from ntbs_list.json and report its number of rows.
ntblist = pd.read_json('ntbs_list.json')
print(ntblist.shape[0])
ntblist.head()
9941038
# TODO
# Work on a deep copy so the filtering below leaves imports_df untouched.
imports_df2 = imports_df.copy(deep=True)
imports_df2.head()
# Keep only the notebooks whose recorded version is Python 3.6, 3.7 or 3.8.
# The mask comes from versions_df but is applied to imports_df2, which assumes
# both frames share the same row index — TODO confirm.
recent_versions = ['python 3.6', 'python 3.7', 'python 3.8']
is_python_36_plus = versions_df['version'].astype(str).isin(recent_versions)
imports_df2 = imports_df2[is_python_36_plus]
new_nbs_count = imports_df2.shape[0]
imports_df2.head()
import json
def parse_list_column(s):
    """Turn a stringified list from a CSV cell back into a real list.

    Args:
        s: The cell value. Normally text such as "['numpy', 'os.path']";
            it may also be a non-string (for example NaN for an empty cell)
            or a list that has already been parsed.

    Returns:
        The parsed list. An empty list is returned when the value is not a
        string, is not valid JSON after quote normalisation, or parses to
        something other than a list.
    """
    if isinstance(s, list):
        # Already parsed, e.g. because the notebook cell was run a second
        # time; returning [] here would silently wipe the column.
        return s
    try:
        # Single quotes are swapped for double quotes so that json can read
        # the text (presumably a Python list repr written by to_csv).
        parsed = json.loads(s.replace("'", '"'))
    except (AttributeError, TypeError, ValueError):
        # AttributeError / TypeError: the value is not a str (NaN, None, ...).
        # ValueError: json.JSONDecodeError for text that is not valid JSON.
        return []
    # Scalars and objects are valid JSON but are not lists of imports.
    return parsed if isinstance(parsed, list) else []
# Names of Python standard-library modules. The first literal carries Python 3
# names (e.g. 'asyncio', 'pathlib'); the second carries Python 2 era names
# (e.g. 'ConfigParser', 'urllib2'). set() removes the entries that appear in
# both literals, so the order of the resulting list is arbitrary.
builtin_imports = list(set(['__future__', '__main__', '_dummy_thread', '_thread', 'abc', 'aifc', 'argparse', 'array', 'ast', 'asynchat', 'asyncio', 'asyncore', 'atexit', 'audioop', 'base64', 'bdb', 'binascii', 'binhex', 'bisect', 'builtins', 'bz2', 'cProfile', 'calendar', 'cgi', 'cgitb', 'chunk', 'cmath', 'cmd', 'code', 'codecs', 'codeop', 'collections', 'collections.abc', 'colorsys', 'compileall', 'concurrent.futures', 'configparser', 'contextlib', 'copy', 'copyreg', 'crypt', 'csv', 'ctypes', 'curses', 'curses.ascii', 'curses.panel', 'curses.textpad', 'datetime', 'dbm', 'dbm.dumb', 'dbm.gnu', 'dbm.ndbm', 'decimal', 'difflib', 'dis', 'distutils', 'distutils.archive_util', 'distutils.bcppcompiler', 'distutils.ccompiler', 'distutils.cmd', 'distutils.command', 'distutils.command.bdist', 'distutils.command.bdist_dumb', 'distutils.command.bdist_msi', 'distutils.command.bdist_packager', 'distutils.command.bdist_rpm', 'distutils.command.bdist_wininst', 'distutils.command.build', 'distutils.command.build_clib', 'distutils.command.build_ext', 'distutils.command.build_py', 'distutils.command.build_scripts', 'distutils.command.check', 'distutils.command.clean', 'distutils.command.config', 'distutils.command.install', 'distutils.command.install_data', 'distutils.command.install_headers', 'distutils.command.install_lib', 'distutils.command.install_scripts', 'distutils.command.register', 'distutils.command.sdist', 'distutils.core', 'distutils.cygwinccompiler', 'distutils.debug', 'distutils.dep_util', 'distutils.dir_util', 'distutils.dist', 'distutils.errors', 'distutils.extension', 'distutils.fancy_getopt', 'distutils.file_util', 'distutils.filelist', 'distutils.log', 'distutils.msvccompiler', 'distutils.spawn', 'distutils.sysconfig', 'distutils.text_file', 'distutils.unixccompiler', 'distutils.util', 'distutils.version', 'doctest', 'dummy_threading', 'email', 'email.charset', 'email.contentmanager', 'email.encoders', 'email.errors', 'email.generator', 'email.header', 
'email.headerregistry', 'email.iterators', 'email.message', 'email.mime', 'email.parser', 'email.policy', 'email.utils', 'encodings.idna', 'encodings.mbcs', 'encodings.utf_8_sig', 'ensurepip', 'enum', 'errno', 'faulthandler', 'fcntl', 'filecmp', 'fileinput', 'fnmatch', 'formatter', 'fpectl', 'fractions', 'ftplib', 'functools', 'gc', 'getopt', 'getpass', 'gettext', 'glob', 'grp', 'gzip', 'hashlib', 'heapq', 'hmac', 'html', 'html.entities', 'html.parser', 'http', 'http.client', 'http.cookiejar', 'http.cookies', 'http.server', 'imaplib', 'imghdr', 'imp', 'importlib', 'importlib.abc', 'importlib.machinery', 'importlib.util', 'inspect', 'io', 'ipaddress', 'itertools', 'json', 'json.tool', 'keyword', 'lib2to3', 'linecache', 'locale', 'logging', 'logging.config', 'logging.handlers', 'lzma', 'macpath', 'mailbox', 'mailcap', 'marshal', 'math', 'mimetypes', 'mmap', 'modulefinder', 'msilib', 'msvcrt', 'multiprocessing', 'multiprocessing.connection', 'multiprocessing.dummy', 'multiprocessing.managers', 'multiprocessing.pool', 'multiprocessing.sharedctypes', 'netrc', 'nis', 'nntplib', 'numbers', 'operator', 'optparse', 'os', 'os.path', 'ossaudiodev', 'parser', 'pathlib', 'pdb', 'pickle', 'pickletools', 'pipes', 'pkgutil', 'platform', 'plistlib', 'poplib', 'posix', 'pprint', 'profile', 'pstats', 'pty', 'pwd', 'py_compile', 'pyclbr', 'pydoc', 'queue', 'quopri', 'random', 're', 'readline', 'reprlib', 'resource', 'rlcompleter', 'runpy', 'sched', 'select', 'selectors', 'shelve', 'shlex', 'shutil', 'signal', 'site', 'smtpd', 'smtplib', 'sndhdr', 'socket', 'socketserver', 'spwd', 'sqlite3', 'ssl', 'stat', 'statistics', 'string', 'stringprep', 'struct', 'subprocess', 'sunau', 'symbol', 'symtable', 'sys', 'sysconfig', 'syslog', 'tabnanny', 'tarfile', 'telnetlib', 'tempfile', 'termios', 'test', 'test.support', 'textwrap', 'threading', 'time', 'timeit', 'tkinter', 'tkinter.scrolledtext', 'tkinter.tix', 'tkinter.ttk', 'token', 'tokenize', 'trace', 'traceback', 'tracemalloc', 'tty', 
'turtle', 'turtledemo', 'types', 'typing', 'unicodedata', 'unittest', 'unittest.mock', 'urllib', 'urllib.error', 'urllib.parse', 'urllib.request', 'urllib.response', 'urllib.robotparser', 'uu', 'uuid', 'venv', 'warnings', 'wave', 'weakref', 'webbrowser', 'winreg', 'winsound', 'wsgiref', 'wsgiref.handlers', 'wsgiref.headers', 'wsgiref.simple_server', 'wsgiref.util', 'wsgiref.validate', 'xdrlib', 'xml', 'xml.dom', 'xml.dom.minidom', 'xml.dom.pulldom', 'xml.etree.ElementTree', 'xml.parsers.expat', 'xml.parsers.expat.errors', 'xml.parsers.expat.model', 'xml.sax', 'xml.sax.handler', 'xml.sax.saxutils', 'xml.sax.xmlreader', 'xmlrpc.client', 'xmlrpc.server', 'zipapp', 'zipfile', 'zipimport', 'zlib'] + ['AL', 'BaseHTTPServer', 'Bastion', 'CGIHTTPServer', 'Carbon.AE', 'Carbon.AH', 'Carbon.App', 'Carbon.Appearance', 'Carbon.CF', 'Carbon.CG', 'Carbon.CarbonEvents', 'Carbon.CarbonEvt', 'Carbon.Cm', 'Carbon.Components', 'Carbon.ControlAccessor', 'Carbon.Controls', 'Carbon.CoreFounation', 'Carbon.CoreGraphics', 'Carbon.Ctl', 'Carbon.Dialogs', 'Carbon.Dlg', 'Carbon.Drag', 'Carbon.Dragconst', 'Carbon.Events', 'Carbon.Evt', 'Carbon.File', 'Carbon.Files', 'Carbon.Fm', 'Carbon.Folder', 'Carbon.Folders', 'Carbon.Fonts', 'Carbon.Help', 'Carbon.IBCarbon', 'Carbon.IBCarbonRuntime', 'Carbon.Icns', 'Carbon.Icons', 'Carbon.Launch', 'Carbon.LaunchServices', 'Carbon.List', 'Carbon.Lists', 'Carbon.MacHelp', 'Carbon.MediaDescr', 'Carbon.Menu', 'Carbon.Menus', 'Carbon.Mlte', 'Carbon.OSA', 'Carbon.OSAconst', 'Carbon.QDOffscreen', 'Carbon.Qd', 'Carbon.Qdoffs', 'Carbon.Qt', 'Carbon.QuickDraw', 'Carbon.QuickTime', 'Carbon.Res', 'Carbon.Resources', 'Carbon.Scrap', 'Carbon.Snd', 'Carbon.Sound', 'Carbon.TE', 'Carbon.TextEdit', 'Carbon.Win', 'Carbon.Windows', 'ColorPicker', 'ConfigParser', 'Cookie', 'DEVICE', 'DocXMLRPCServer', 'EasyDialogs', 'FL', 'FrameWork', 'GL', 'HTMLParser', 'MacOS', 'MimeWriter', 'MiniAEFrame', 'Nav', 'PixMapWrapper', 'Queue', 'SUNAUDIODEV', 'ScrolledText', 'SimpleHTTPServer', 
'SimpleXMLRPCServer', 'SocketServer', 'StringIO', 'Tix', 'Tkinter', 'UserDict', 'UserList', 'UserString', 'W', '__builtin__', '__future__', '__main__', '_winreg', 'abc', 'aepack', 'aetools', 'aetypes', 'aifc', 'al', 'anydbm', 'applesingle', 'argparse', 'array', 'ast', 'asynchat', 'asyncore', 'atexit', 'audioop', 'autoGIL', 'base64', 'bdb', 'binascii', 'binhex', 'bisect', 'bsddb', 'buildtools', 'bz2', 'cPickle', 'cProfile', 'cStringIO', 'calendar', 'cd', 'cfmfile', 'cgi', 'cgitb', 'chunk', 'cmath', 'cmd', 'code', 'codecs', 'codeop', 'collections', 'colorsys', 'commands', 'compileall', 'compiler', 'compiler.ast', 'compiler.visitor', 'contextlib', 'cookielib', 'copy', 'copy_reg', 'crypt', 'csv', 'ctypes', 'curses', 'curses.ascii', 'curses.panel', 'curses.textpad', 'datetime', 'dbhash', 'dbm', 'decimal', 'difflib', 'dircache', 'dis', 'distutils', 'distutils.archive_util', 'distutils.bcppcompiler', 'distutils.ccompiler', 'distutils.cmd', 'distutils.command', 'distutils.command.bdist', 'distutils.command.bdist_dumb', 'distutils.command.bdist_msi', 'distutils.command.bdist_packager', 'distutils.command.bdist_rpm', 'distutils.command.bdist_wininst', 'distutils.command.build', 'distutils.command.build_clib', 'distutils.command.build_ext', 'distutils.command.build_py', 'distutils.command.build_scripts', 'distutils.command.check', 'distutils.command.clean', 'distutils.command.config', 'distutils.command.install', 'distutils.command.install_data', 'distutils.command.install_headers', 'distutils.command.install_lib', 'distutils.command.install_scripts', 'distutils.command.register', 'distutils.command.sdist', 'distutils.core', 'distutils.cygwinccompiler', 'distutils.debug', 'distutils.dep_util', 'distutils.dir_util', 'distutils.dist', 'distutils.emxccompiler', 'distutils.errors', 'distutils.extension', 'distutils.fancy_getopt', 'distutils.file_util', 'distutils.filelist', 'distutils.log', 'distutils.msvccompiler', 'distutils.spawn', 'distutils.sysconfig', 'distutils.text_file', 
'distutils.unixccompiler', 'distutils.util', 'distutils.version', 'dl', 'doctest', 'dumbdbm', 'dummy_thread', 'dummy_threading', 'email', 'email.charset', 'email.encoders', 'email.errors', 'email.generator', 'email.header', 'email.iterators', 'email.message', 'email.mime', 'email.parser', 'email.utils', 'encodings.idna', 'encodings.utf_8_sig', 'ensurepip', 'errno', 'exceptions', 'fcntl', 'filecmp', 'fileinput', 'findertools', 'fl', 'flp', 'fm', 'fnmatch', 'formatter', 'fpectl', 'fpformat', 'fractions', 'ftplib', 'functools', 'future_builtins', 'gc', 'gdbm', 'gensuitemodule', 'getopt', 'getpass', 'gettext', 'gl', 'glob', 'grp', 'gzip', 'hashlib', 'heapq', 'hmac', 'hotshot', 'hotshot.stats', 'htmlentitydefs', 'htmllib', 'httplib', 'ic', 'icopen', 'imageop', 'imaplib', 'imgfile', 'imghdr', 'imp', 'importlib', 'imputil', 'inspect', 'io', 'itertools', 'jpeg', 'json', 'keyword', 'lib2to3', 'linecache', 'locale', 'logging', 'logging.config', 'logging.handlers', 'macerrors', 'macostools', 'macpath', 'macresource', 'mailbox', 'mailcap', 'marshal', 'math', 'md5', 'mhlib', 'mimetools', 'mimetypes', 'mimify', 'mmap', 'modulefinder', 'msilib', 'msvcrt', 'multifile', 'multiprocessing', 'multiprocessing.connection', 'multiprocessing.dummy', 'multiprocessing.managers', 'multiprocessing.pool', 'multiprocessing.sharedctypes', 'mutex', 'netrc', 'new', 'nis', 'nntplib', 'numbers', 'operator', 'optparse', 'os', 'os.path', 'ossaudiodev', 'parser', 'pdb', 'pickle', 'pickletools', 'pipes', 'pkgutil', 'platform', 'plistlib', 'popen2', 'poplib', 'posix', 'posixfile', 'pprint', 'profile', 'pstats', 'pty', 'pwd', 'py_compile', 'pyclbr', 'pydoc', 'quopri', 'random', 're', 'readline', 'resource', 'rexec', 'rfc822', 'rlcompleter', 'robotparser', 'runpy', 'sched', 'select', 'sets', 'sgmllib', 'sha', 'shelve', 'shlex', 'shutil', 'signal', 'site', 'smtpd', 'smtplib', 'sndhdr', 'socket', 'spwd', 'sqlite3', 'ssl', 'stat', 'statvfs', 'string', 'stringprep', 'struct', 'subprocess', 'sunau', 
'sunaudiodev', 'symbol', 'symtable', 'sys', 'sysconfig', 'syslog', 'tabnanny', 'tarfile', 'telnetlib', 'tempfile', 'termios', 'test', 'test.test_support', 'textwrap', 'thread', 'threading', 'time', 'timeit', 'token', 'tokenize', 'trace', 'traceback', 'ttk', 'tty', 'turtle', 'types', 'unicodedata', 'unittest', 'urllib', 'urllib2', 'urlparse', 'user', 'uu', 'uuid', 'videoreader', 'warnings', 'wave', 'weakref', 'webbrowser', 'whichdb', 'winsound', 'wsgiref', 'wsgiref.handlers', 'wsgiref.headers', 'wsgiref.simple_server', 'wsgiref.util', 'wsgiref.validate', 'xdrlib', 'xml', 'xml.dom', 'xml.dom.minidom', 'xml.dom.pulldom', 'xml.etree.ElementTree', 'xml.parsers.expat', 'xml.sax', 'xml.sax.handler', 'xml.sax.saxutils', 'xml.sax.xmlreader', 'xmlrpclib', 'zipfile', 'zipimport', 'zlib']))
# Display the de-duplicated list.
builtin_imports
from collections import defaultdict
# The CSV round-trip leaves the lists as text, so rebuild real lists first and
# then record how many entries each notebook's list has.
imports_df2['imports'] = imports_df2['imports'].map(parse_list_column)
imports_df2['num_imports'] = imports_df2['imports'].map(len)
imports_df2
# Count, for every top-level package, the number of notebooks that import it.
imports_count = defaultdict(int)
for imports in imports_df2['imports']:
    # Reduce 'pkg.sub.mod' to 'pkg' and de-duplicate, so that one notebook
    # contributes at most one count per package.
    top_level_libs = {lib.split('.')[0] for lib in imports}
    for lib in top_level_libs:
        # defaultdict(int) supplies the initial 0 for a package not seen yet.
        imports_count[lib] += 1

# Most frequently imported packages first; the sort is stable, so packages
# with equal counts keep the order in which they were first encountered.
sorted_imports_count = sorted(imports_count.items(), key=lambda itm: itm[1], reverse=True)
# Keep only the packages imported by more than 1000 notebooks.
sorted_imports_count = [x for x in sorted_imports_count if x[1] > 1000]
frequent_imports = [name for name, _ in sorted_imports_count]
# The number printed here is the count of packages that passed the threshold.
print(f'Got {len(sorted_imports_count)} unique libraries')
Got 607 unique libraries
# Show the 25 most frequently imported packages.
pd.DataFrame(sorted_imports_count).head(25)
# Plotting libraries to look up in the frequency table.
plot_libs = [
    'seaborn', 'matplotlib', 'bokeh', 'plotly', 'ggplot', 'pygal',
    'geoplotlib', 'gleam', 'missingno', 'leather', 'altair', 'folium',
    'livelossplot', 'scikitplot', 'gmplot', 'plotnine',
]
# Frequency table: column 0 is the package name, column 1 its notebook count.
sid = pd.DataFrame(sorted_imports_count)
is_plot_lib = sid[0].isin(plot_libs)
imports_sid = sid[is_plot_lib]
imports_sid
from math import pi
# Disabled load of a helper notebook.
# NOTE(review): bokeh_deepnote_show, called at the end of this cell, is not
# defined in the visible part of this file; presumably it comes from
# bokeh-utils.ipynb — confirm before running.
#%run bokeh-utils.ipynb
# Select 'pandas_bokeh' as the pandas plotting backend.
pd.set_option('plotting.backend', 'pandas_bokeh')
# Use column 0 (the library name) as the index of the frame that gets plotted.
bbb = imports_sid.set_index(0)
# Bar chart of the plotting-library counts.
bar = bbb.plot_bokeh.bar(
ylabel="Count",
xlabel="Library name",
title='Most used plotting libraries',
legend=False,
alpha=0.9)
# Tilt the x-axis labels by pi/4 radians (45 degrees).
bar.xaxis.major_label_orientation = pi/4
# Disabled alternatives kept from earlier experiments.
#bar.y_axis_type = 'log'
#bar.legend = False
bokeh_deepnote_show(bar)
# Machine-learning frameworks ('sklearn' is left out of this list).
ml_libs = ['tensorflow', 'torch', 'keras', 'fastai']
# Frequency table: column 0 is the package name, column 1 its notebook count.
sid = pd.DataFrame(sorted_imports_count)
sid[sid[0].isin(ml_libs)]

# Natural-language-processing libraries.
nlp_libs = ['nltk', 'gensim', 'polyglot', 'textblob', 'corenlp', 'spacy', 'pattern', 'vocabulary', 'pynlpl', 'quepy']
sid[sid[0].isin(nlp_libs)]

# Geospatial libraries.
geo_libs = ['geopandas', 'shapely', 'rasterio', 'geoviews', 'folium', 'ipyleaflet', 'gdal', 'ogr', 'osr', 'gdalnumeric', 'gdalconst']
sid[sid[0].isin(geo_libs)]

# Chemistry and materials-science libraries. Each name appears once; a
# repeated entry has no effect on isin() and only obscures the list.
chem_libs = [
    'chemlib', 'chemlab', 'chempy', 'pubchempy', 'cirpy', 'chemspipy',
    'surfinpy', 'sumo', 'pymoldyn', 'nglview', 'imolecule', 'chemview',
    'ase', 'batchcalculator', 'cctbx', 'ionize', 'mendeleev', 'propka',
    'pybel', 'pycroscopy', 'pyiron', 'pymatgen', 'symfit', 'symmetry',
    'stk', 'ccdc', 'cclib', 'deepchem', 'emmet', 'horton', 'pysic', 'tsase',
]
sid[sid[0].isin(chem_libs)]

# Biomedical data libraries.
biomed = ['nibabel', 'wfdb', 'heartpy']
sid[sid[0].isin(biomed)]
# Astronomy libraries, listed by top-level package name. The frequency table
# is keyed by the part of each import before the first dot, so dotted entries
# such as 'stsci.tools' or 'skyfield.api' could never match a row; they are
# written as 'stsci' and 'skyfield' here ('chianti.core' is covered by
# 'chianti').
astronomy_libs = [
    'astropy', 'stsci', 'jwql', 'aplpy', 'kapteyn', 'pyfits', 'pywcsgrid2',
    'pyregion', 'cosmolopy', 'chianti', 'montage_wrapper', 'sunpy',
    'aspylib', 'pyspeckit', 'sep', 'fitsio', 'poppy', 'soapy', 'aotools',
    'skyfield',
]
sid[sid[0].isin(astronomy_libs)]
# Compression and archive modules to look up in the frequency table.
compression_libs = ['gzip', 'zipfile', 'bz2', 'tarfile', 'zlib']
is_compression_lib = sid[0].isin(compression_libs)
sid[is_compression_lib]
# Display the complete frequency table.
sid