import datetime
import ftplib
import json
import os
import sys
import urllib
import urllib.error
import urllib.request
from pathlib import Path

import pandas as pd
from tqdm import tqdm
# contains the mapping from sols to the list of images from the current sol
image_manifest_url = 'http://mars.jpl.nasa.gov/msl-raw-images/image/image_manifest.json'
# Use a context manager so the HTTP response is closed promptly
# (the original left the socket open until garbage collection).
with urllib.request.urlopen(image_manifest_url) as manifest_data:
    manifest_dict = json.load(manifest_data)
sols = manifest_dict['sols']
# highest sol number present in the manifest
sols_available = max(sol['sol'] for sol in sols)
print("Maximum sols available: " + str(sols_available))
# Output: Maximum sols available: 3098
# url to the image manifest of a given sol
def sol_manifest_url(sol_num):
    """Return the URL of the raw-image manifest JSON for the given sol."""
    return f'http://mars.jpl.nasa.gov/msl-raw-images/image/images_sol{sol_num}.json'
# url to the NASA PDS ftp service that contains raw images and labels
pds_node_url = 'pdsimage2.wr.usgs.gov'

def pds_folder_url(sol_num):
    """Return the FTP directory path for the given sol, zero-padded to 5 digits."""
    padded_sol = str(sol_num).zfill(5)
    return f'/archive/MSL/MSLNAV_0XXX/DATA/SOL{padded_sol}/'
SOL_RANGE = range(0, 10)  # sols_available -- pick small range for a quick run
cameras = ['NAV_LEFT_A', 'NAV_RIGHT_A', 'MAST_LEFT', 'MAST_RIGHT', 'FHAZ_RIGHT_A', 'RHAZ_RIGHT_A', 'FHAZ_LEFT_A', 'RHAZ_LEFT_A', 'FHAZ_RIGHT_B', 'RHAZ_RIGHT_B', 'FHAZ_LEFT_B', 'RHAZ_LEFT_B', 'MARDI', 'CHEMCAM_RMI', 'MAHLI', 'NAV_LEFT_B', 'NAV_RIGHT_B']
# BUG FIX: list.append() mutates in place and returns None, so the original
# ``['sol', 'images'].append(cameras)`` passed columns=None; concatenate instead.
columns = ['sol', 'images'] + cameras
# Accumulate one-row frames and concatenate once at the end:
# DataFrame.append was deprecated and removed in pandas 2.0, and repeated
# appends are quadratic in the number of rows.
row_frames = []
for solnum in tqdm(SOL_RANGE):
    # per-sol counters: total non-thumbnail images plus one count per camera
    sol_row_json = {'sol': [solnum], 'images': [0], **{k: [0] for k in cameras}}
    try:
        sol_data = urllib.request.urlopen(sol_manifest_url(solnum))
    except urllib.error.URLError:
        # keep an all-zero row if the sol manifest is unavailable
        row_frames.append(pd.DataFrame.from_dict(sol_row_json))
        continue
    with sol_data:
        sol_data_json = json.load(sol_data)
    for img in sol_data_json['images']:
        instrument = img['instrument']
        if img['sampleType'] != 'thumbnail':  # do not count thumbnail images
            sol_row_json['images'][0] += 1
            # NOTE(review): raises KeyError if an instrument outside
            # ``cameras`` appears -- same as the original behavior.
            sol_row_json[instrument][0] += 1
    row_frames.append(pd.DataFrame.from_dict(sol_row_json))
images_df = pd.concat(row_frames, ignore_index=True)[columns] if row_frames else pd.DataFrame(columns=columns)
# Output: 100%|██████████| 10/10 [00:01<00:00,  6.49it/s]
# Keep a reference to the raw (unfiltered) per-sol counts table.
raw_images_df = images_df
# display the table (notebook cell output)
raw_images_df
# Filter sols with zero images
# Drop sols for which no non-thumbnail images were counted.
images_df_non_zero = raw_images_df[raw_images_df['images'] > 0]
# display the filtered table (notebook cell output)
images_df_non_zero
# Sum camera columns for better readability
# Work on an explicit copy: the original aliased a filtered view and then
# silenced the resulting chained-assignment warning globally via
# ``pd.options.mode.chained_assignment = None`` -- fix the cause instead.
images_df_summed = images_df_non_zero.copy()
# Group the per-camera columns into readable aggregate columns.
navcam_cols = ['NAV_LEFT_A', 'NAV_RIGHT_A', 'NAV_LEFT_B', 'NAV_RIGHT_B']
mastcam_cols = ['MAST_LEFT', 'MAST_RIGHT']
hazcam_cols = ['FHAZ_RIGHT_A', 'RHAZ_RIGHT_A', 'FHAZ_LEFT_A', 'RHAZ_LEFT_A',
               'FHAZ_RIGHT_B', 'RHAZ_RIGHT_B', 'FHAZ_LEFT_B', 'RHAZ_LEFT_B']
images_df_summed['NAVCAMS'] = images_df_summed[navcam_cols].sum(axis=1)
images_df_summed['MASTCAMS'] = images_df_summed[mastcam_cols].sum(axis=1)
images_df_summed['HAZCAMS'] = images_df_summed[hazcam_cols].sum(axis=1)
# display the summarized table (notebook cell output)
images_df_summed
# Sort sols by their NAV_LEFT_A image count, most images first.
images_df_sorted = images_df_summed.sort_values(by=['NAV_LEFT_A'], ascending=False)
# display the sorted table (notebook cell output)
images_df_sorted
# Drop entries with too few (or, in practice, too many) navcam images
# Keep only sols with at least 2 left-navcam images.
images_df_final = images_df_sorted[images_df_sorted['NAV_LEFT_A'] >= 2] # I used >= 50 and <= 100 in practice
# display the final selection (notebook cell output)
images_df_final
# Download and manually select image sequences suitable for VO (visual odometry)
# Download scripts
def get_img(img_url):
    """Download the image at *img_url* and decode it into an OpenCV array.

    Returns whatever ``cv2.imdecode`` yields for the raw bytes (decoded
    as-is, flag -1 keeps the original channels/depth).
    """
    import numpy as np  # BUG FIX: np was used but never imported in this file
    # NOTE(review): cv2 (OpenCV) is also used but never imported anywhere in
    # this file -- it must be imported at module level for this to run.
    # Context manager ensures the HTTP response is closed after reading.
    with urllib.request.urlopen(img_url) as req:
        arr = np.asarray(bytearray(req.read()), dtype=np.uint8)
    return cv2.imdecode(arr, -1)
def collect_imgs(out_folder, solnum, instrument, sample_type=None):
    """Download all non-thumbnail images for one sol/instrument into *out_folder*.

    Each image is saved as ``<itemName>.jpg`` and an ``index`` file listing
    the source URLs is written alongside them. If *sample_type* is given,
    only images with that exact sampleType are downloaded.
    """
    try:
        sol_data = urllib.request.urlopen(sol_manifest_url(solnum))
    except urllib.error.URLError:  # narrowed from a bare except
        print('Can not open url for sol: ' + str(solnum))
        return
    urls = []
    # Close the manifest response once parsed.
    with sol_data:
        sol_dict = json.load(sol_data)
    Path(out_folder).mkdir(parents=True, exist_ok=True)
    for img in tqdm(sol_dict['images']):
        # BUG FIX: the original compared against ``sampleType`` (NameError);
        # the parameter is named ``sample_type``.
        if (img['instrument'] == instrument) and (img['sampleType'] != 'thumbnail') \
                and (sample_type is None or img['sampleType'] == sample_type):
            img_url = img['urlList']
            urls.append(img_url)
            img_name = img['itemName']
            urllib.request.urlretrieve(img_url, out_folder + str(img_name) + ".jpg")
    # Record the URL of every downloaded image for later reference.
    with open(out_folder + "index", 'w') as f:
        for item in urls:
            f.write("%s\n" % item)
# Sols selected for download, taken from the filtered/sorted table above.
sols_to_collect = list(images_df_final['sol'])
# display the selection (notebook cell output)
sols_to_collect
# Root folder for the downloaded dataset; one subfolder per sol.
dataset_dir = './out/'
# Download left-navcam images for (up to) the first 50 selected sols.
for sol in tqdm(sols_to_collect[0:50]):
    collect_imgs(dataset_dir + str(sol) + '/', sol, 'NAV_LEFT_A')
def download_labels():
    """Fetch the PDS ``.LBL`` label for every downloaded ``.jpg`` image.

    Connects to the USGS PDS imaging node over FTP and, for each sol
    folder under ``dataset_dir`` (module-level), retrieves the label file
    corresponding to each image into that same folder.
    """
    server = 'pdsimage2.wr.usgs.gov'
    directory = '/archive/MSL/MSLNAV_0XXX/DATA/SOL'
    print("connecting to server")
    # ``with`` closes the control connection even if a transfer fails
    # (the original only called quit() on the success path).
    with ftplib.FTP(server) as ftp:
        ftp.login()
        dir_list = os.listdir(dataset_dir)
        # sort sol folders numerically; park macOS '.DS_Store' residue first
        dir_list.sort(key=lambda x: 0 if x == '.DS_Store' else int(x))
        for sol in tqdm(dir_list):
            images = os.listdir(dataset_dir + str(sol))
            sol_dir = directory + sol.zfill(5) + '/'
            ftp.cwd(sol_dir)
            for image in tqdm(images):
                if not image.endswith('.jpg'):
                    continue
                # Drop '.jpg' plus the final character of the product id and
                # append '1.LBL' -- presumably the PDS label naming scheme;
                # verify against the PDS archive's file-naming docs.
                label_name = image[0:-5] + '1.LBL'
                # BUG FIX: the original passed ``open(...).write`` and never
                # closed the file, leaking one handle per label.
                with open(dataset_dir + str(sol) + '/' + label_name, 'wb') as fh:
                    ftp.retrbinary("RETR {}".format(label_name), fh.write)
# Top-level entry point: fetch labels for everything under dataset_dir.
download_labels()