import certifi
import csv
import json
import os
import re
import socket
import ssl
import subprocess
import time
from datetime import date, timedelta
from urllib.error import HTTPError, URLError
from urllib.request import urlopen

import requests


def download_data(mode, version, n_max_tries, max_time_per_dl):

    if mode == 'all':
        start_year = 1971
        end_year = 2024
    elif mode == 'nrt':
        start_year = date.today().strftime('%Y')
        end_year = (date.today() + timedelta(days=365)).strftime('%Y')
        version = mode
    else:
        print('time mode inapplicable')
        return  # bail out, otherwise start_year/end_year below are undefined

    # iterate through the requested years
    for year in range(int(start_year), int(end_year)):
        print(year)
        link_url = 'https://www.ncei.noaa.gov/data/global-hourly/archive/isd'

        # catch server connection exceptions, retrying until the listing page is read
        read_url = False
        while not read_url:
            try:
                # the cafile= parameter of urlopen was deprecated in Python 3.6 and
                # removed in 3.13; pass an SSL context built from the certifi bundle instead
                ssl_context = ssl.create_default_context(cafile=certifi.where())
                page = urlopen(link_url, timeout=max_time_per_dl, context=ssl_context).read().decode('utf-8-sig')
                link_data = re.findall(r"href=[\"'](.*?)[\"']", page)
                read_url = True
            except HTTPError as error:
                print('Data not retrieved because %s\nURL: %s' % (error, link_url))
            except URLError as error:
                print('Data not retrieved because %s\nURL: %s' % (error, link_url))
            except socket.timeout:
                print('socket timed out - URL: %s' % link_url)

        # keep only the archive links for the current year
        link_list = ['{}/{}'.format(link_url, lnk) for lnk in link_data if 'isd_{}'.format(year) in lnk]

        # directory to save files into
        specific_directory = '/esarchive/obs/ghost/NOAA_ISD/original_files/{}/meteo/{}/'.format(version, year)
        os.makedirs(specific_directory, exist_ok=True)

        # iterate through each link and download it to the required directory;
        # wget -N only re-downloads a remote file if it is newer than the local copy,
        # and retrying a fixed number of times handles the server sporadically
        # hanging for ~3 minutes before giving up on a link
        for link in link_list:
            n_tries = 0
            errcode = 999
            while (n_tries < n_max_tries) and (errcode != 0):
                if n_tries == 0:
                    print('Checking/Downloading %s' % link)
                else:
                    print('*** Previous check/download failed. Re-trying for %s' % link)
                cmd = 'wget -N -P %s %s -q -o /dev/null' % (specific_directory, link)
                process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf8', shell=True)
                status = process.communicate()[0]
                errcode = process.returncode
                if errcode != 0:
                    n_tries += 1

            # untar the downloaded archive
            lnk = link.split('/')[-1]
            cmd = 'tar -xf {}/{} -C {}'.format(specific_directory, lnk, specific_directory)
            print('Un-tarring file')
            process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf8', shell=True)
            status = process.communicate()[0]
            errcode = process.returncode

            # remove the extracted isd-history listing
            cmd = 'rm {}/isd-history*'.format(specific_directory)
            process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf8', shell=True)
            status = process.communicate()[0]
            errcode = process.returncode

            # remove the tar file itself
            cmd = 'rm {}/{}'.format(specific_directory, lnk)
            process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf8', shell=True)
            status = process.communicate()[0]
            errcode = process.returncode


def download_metadata(n_max_tries, max_time_per_dl):

    url_metadata = 'https://www.ncei.noaa.gov/pub/data/noaa/isd-history.csv'
    download_location = '/esarchive/obs/ghost/NOAA_ISD/metadata/network_provided/NOAA_ISD_META_{}.csv'

    n_tries = 0
    errcode = 999
    today = date.today()

    while (n_tries < n_max_tries) and (errcode != 200):
        r = requests.get(url_metadata, timeout=max_time_per_dl)
        if r.status_code == 200:
            with open(download_location.format(today.strftime('%Y%m%d')), 'wb') as outfile:
                outfile.write(r.content)
            print('Downloaded metadata')
            errcode = r.status_code
        elif r.status_code == 404:
            print('No metadata found, error 404')
            errcode = 200
        else:
            # try again, doubling the timeout and waiting a little longer on every attempt
            print('Response error {}, attempt {}'.format(r.status_code, n_tries))
            errcode = r.status_code
            n_tries += 1
            max_time_per_dl = max_time_per_dl * 2
            time.sleep(n_tries ** 2)

    if n_tries == n_max_tries:
        print('Failed downloading {} {} times in {} seconds, error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))

    time.sleep(1)

    """
    # create json from original metadata file
    json_metadata = {}
    with open('/esarchive/obs/ghost/NOAA_ISD/metadata/network_provided/NOAA_ISD_META.csv', 'r', encoding='utf-8') as file:
        csv_filedata = csv.DictReader(file)
        for row in csv_filedata:
            key = row['USAF']
            update_date = today.strftime('%Y-%m-%d')
            for parameter in row:
                row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]}  # create inner dictionary for every parameter
            json_metadata[key] = row

    with open('/esarchive/obs/ghost/NOAA_ISD/metadata/processed/NOAA_ISD_META.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(json_metadata, indent=4))
    """

    # create json in the desired shape from the current metadata file
    json_metadata_now = {}
    with open(download_location.format(today.strftime('%Y%m%d')), encoding='utf-8') as file:
        csv_filedata = csv.DictReader(file)
        for row in csv_filedata:
            key = row['USAF']
            update_date = today.strftime('%Y-%m-%d')
            for parameter in row:
                row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]}  # create inner dictionary for every parameter
            json_metadata_now[key] = row

    # read the standardised file to compare against
    with open('/esarchive/obs/ghost/NOAA_ISD/metadata/processed/NOAA_ISD_META.json', 'r', encoding='utf-8') as f:
        json_metadata = json.loads(f.read())

    for station in json_metadata:  # loop through all the old stations
        if station in json_metadata_now:  # if the station is in the current metadata, go on
            for parameter in json_metadata[station]:
                if parameter in json_metadata_now[station]:  # check that the csv column also exists in the new file
                    # compare the last entry in the standardised file to the value in the new file;
                    # if the value differs, append it to the standardised metadata file
                    if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]:
                        print('old {} --- new {}'.format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
                        json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
                        json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
                else:
                    print('{} not in new metadata file'.format(parameter))
        else:
            print('Station {} was abolished'.format(station))

    for station in json_metadata_now:  # loop through all the new stations
        if station not in json_metadata:  # a new station appeared
            print('New station {}'.format(station))
            json_metadata[station] = json_metadata_now[station]
            continue
        for parameter in json_metadata_now[station]:  # loop through all the parameters
            # is there a new parameter (column) that was not in the old file?
            if parameter not in json_metadata[station]:
                print('{} is new'.format(parameter))
                json_metadata[station][parameter] = json_metadata_now[station][parameter]

    # save the merged metadata
    with open('/esarchive/obs/ghost/NOAA_ISD/metadata/processed/NOAA_ISD_META.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(json_metadata, indent=4))
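

# A minimal usage sketch, assuming this module is run directly as a script.
# The retry count (3), per-download timeout in seconds (120), and the version
# tag '1.6' are illustrative values, not ones fixed by this module; in 'nrt'
# mode the version argument is overwritten by the mode name anyway.
if __name__ == '__main__':
    download_metadata(n_max_tries=3, max_time_per_dl=120)
    download_data(mode='nrt', version='1.6', n_max_tries=3, max_time_per_dl=120)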