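"""Download NOAA ISD (Integrated Surface Database) global-hourly observations
and station metadata from the NCEI server into the GHOST archive tree."""
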
import csv
import json
import os
import re
import socket
import ssl
import subprocess
import time
from datetime import date, timedelta
from urllib.error import HTTPError, URLError
from urllib.request import urlopen

import certifi
import requests


def download_data(mode, version, n_max_tries, max_time_per_dl):
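    """Download the yearly NOAA ISD global-hourly tar archives and unpack them.

    mode 'all' covers the full archive (1971-2023); mode 'nrt' covers the
    current year and stores the files under the 'nrt' version directory.
    """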

    if mode == 'all':
        start_year = 1971
        end_year = 2024

    elif mode == 'nrt':
        start_year = date.today().strftime('%Y')
        end_year = (date.today() + timedelta(days=365)).strftime('%Y')
        version = mode

    else:
        print('time mode inapplicable')
        return
    #iterate through years
    for year in range(int(start_year), int(end_year)):
        print(year)

        link_url = 'https://www.ncei.noaa.gov/data/global-hourly/archive/isd'

        #catch server connection exceptions
        read_url = False
        while not read_url:
            try:
                #the cafile argument of urlopen was removed in Python 3.12, so pass an SSL context instead
                ssl_context = ssl.create_default_context(cafile=certifi.where())
                response = urlopen(link_url, timeout=max_time_per_dl, context=ssl_context)
                link_data = re.findall(r"href=[\"'](.*?)[\"']", response.read().decode('utf-8-sig'))
                read_url = True
            except (HTTPError, URLError) as error:
                print('Data not retrieved because %s\nURL: %s'%(error, link_url))
            except socket.timeout:
                print('socket timed out - URL: %s'%(link_url))

        #keep only the archive links for the current year
        link_list = ['{}/{}'.format(link_url, lnk) for lnk in link_data if 'isd_{}'.format(year) in lnk]

        #dir to save files
        specific_directory = '/esarchive/obs/ghost/NOAA_ISD/original_files/{}/meteo/{}/'.format(version, year)
        os.makedirs(specific_directory, exist_ok=True)

        #iterate through each link and download it to the required directory
        #wget -N only re-downloads a file when the remote copy is newer than the local one
        #the server sporadically hangs for ~3 minutes, so each download is
        #retried a limited number of times before giving up
        for link in link_list:
            n_tries = 0
            errcode = 999
            while (n_tries < n_max_tries) and (errcode != 0):
                if n_tries == 0:
                    print('Checking/Downloading %s'%(link))
                else:
                    print('*** Previous check/download failed. Re-trying for %s'%(link))
                cmd = 'wget -N -P %s %s -q -o /dev/null'%(specific_directory,link)
                process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf8', shell=True)
                status = process.communicate()[0]
                errcode = process.returncode
                if errcode != 0:
                    n_tries+=1

            #skip this link if the download still failed after all retries
            if errcode != 0:
                print('*** Failed downloading %s after %s tries'%(link, n_max_tries))
                continue

            #untar file
            lnk = link.split('/')[-1]
            cmd = 'tar -xf {}/{} -C {}'.format(specific_directory, lnk, specific_directory)
            print('Un-tarring file')
            process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf8', shell=True)
            status = process.communicate()[0]
            errcode = process.returncode

            #remove isd history 
            cmd = 'rm {}/isd-history*'.format(specific_directory)
            process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf8', shell=True)
            status = process.communicate()[0]
            errcode = process.returncode

            #remove tar file
            cmd = 'rm {}/{}'.format(specific_directory,lnk)
            process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf8', shell=True)
            status = process.communicate()[0]
            errcode = process.returncode

def download_metadata(n_max_tries, max_time_per_dl):
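    """Download the current ISD station history CSV and merge any changes into
    the processed JSON metadata record."""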

    url_metadata = 'https://www.ncei.noaa.gov/pub/data/noaa/isd-history.csv'
    download_location = "/esarchive/obs/ghost/NOAA_ISD/metadata/network_provided/NOAA_ISD_META_{}.csv"

    n_tries = 0
    errcode = 999
    today = date.today()
    
    while (n_tries < n_max_tries) and (errcode != 200):
        try:
            r = requests.get(url_metadata, timeout=max_time_per_dl)
        except requests.exceptions.RequestException as error:
            # network-level failure (connection error, timeout): count it as a failed attempt
            print('Request failed: {}, attempt {}'.format(error, n_tries))
            n_tries += 1
            time.sleep(n_tries ** 2)
            continue
        if r.status_code == 200:
            with open(download_location.format(today.strftime('%Y%m%d')), 'wb') as outfile:
                outfile.write(r.content)
            print('Downloaded metadata')
            errcode = r.status_code
        elif r.status_code == 404:
            print("No metadata found, error 404")
            break
        else:
            # try again with a doubled timeout and a growing pause between attempts
            print('Response error {}, attempt {}'.format(r.status_code, n_tries))
            errcode = r.status_code
            n_tries += 1
            max_time_per_dl = max_time_per_dl*2 # increase the request timeout
            time.sleep(n_tries ** 2) # wait a little longer every time

    if n_tries == n_max_tries:
        print('Failed downloading {} after {} attempts, last error code {}'.format(url_metadata, n_tries, errcode))
    time.sleep(1)
    """
    # create json from original metadata file
    json_metadata = {}
    with open('/esarchive/obs/ghost/NOAA_ISD/metadata/network_provided/NOAA_ISD_META.csv', 'r', encoding='utf-8') as file:
        csv_filedata = csv.DictReader(file)

        for row in csv_filedata:  
            key = row['USAF']
            update_date = today.strftime('%Y-%m-%d')
            for parameter in row:
                row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
            json_metadata[key] = row

    with open('/esarchive/obs/ghost/NOAA_ISD/metadata/processed/NOAA_ISD_META.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(json_metadata, indent=4))
        
    """
    
    # create json in desired shape from current metadata file
    json_metadata_now = {}  
    with open(download_location.format(today.strftime('%Y%m%d')), encoding='utf-8') as file:
        csv_filedata = csv.DictReader(file)

        for row in csv_filedata:  
            key = row['USAF']
            update_date = today.strftime('%Y-%m-%d')
            for parameter in row:
                row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
            json_metadata_now[key] = row
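
    # resulting structure (illustrative sketch; the inner keys follow the CSV header):
    # {
    #     "<USAF id>": {
    #         "<column name>": {"values": ["<value>"], "update_time": ["YYYY-MM-DD"]},
    #         ...
    #     },
    #     ...
    # }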

    
    # read the standardised file to compare against
    with open('/esarchive/obs/ghost/NOAA_ISD/metadata/processed/NOAA_ISD_META.json', 'r', encoding='utf-8') as f:
        json_metadata = json.loads(f.read())
        for station in json_metadata: # loop through all the old stations
            if station in json_metadata_now: # station is still in the current metadata
                for parameter in json_metadata[station]:
                    if parameter in json_metadata_now[station]: # check if the csv column exists in the new file
                        # compare the last entry in the standardised file to the value in the new file
                        if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]:
                            # value changed: append it to the standardised metadata file
                            print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
                            json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
                            json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
                    else:
                        print('{} not in new metadata file'.format(parameter))
            else:
                print('Station {} is no longer in the current metadata'.format(station))

        for station in json_metadata_now: # loop through all the new stations
            if station not in json_metadata: # new station appeared
                print('New station {}'.format(station))
                json_metadata[station] = json_metadata_now[station]
                continue
            for parameter in json_metadata_now[station]: # check for columns that were not in the old file
                if parameter not in json_metadata[station]:
                    print('{} is new'.format(parameter))
                    json_metadata[station][parameter] = json_metadata_now[station][parameter]


    # save the merged metadata
    with open('/esarchive/obs/ghost/NOAA_ISD/metadata/processed/NOAA_ISD_META.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(json_metadata, indent=4))
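

# minimal usage sketch (assumed entry point; the argument values below are
# illustrative, not taken from the original script)
if __name__ == '__main__':
    download_data(mode='nrt', version='nrt', n_max_tries=3, max_time_per_dl=60)
    download_metadata(n_max_tries=3, max_time_per_dl=60)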