# NOAA_ISD_download.py
# Downloads and unpacks NOAA Integrated Surface Database (ISD) hourly archives.
import glob
import os
import re
import socket
import ssl
import subprocess
from datetime import date
from datetime import timedelta
from urllib.error import HTTPError, URLError
from urllib.request import urlopen

import certifi




def _fetch_link_list(link_url):
    """Return every href target found on *link_url*.

    Retries indefinitely on server/connection errors, printing each failure
    (preserves the original behaviour of blocking until the index page loads).
    """
    # cafile= on urlopen is deprecated (removed in Python 3.12); build an
    # explicit SSL context from the certifi CA bundle instead.
    ssl_context = ssl.create_default_context(cafile=certifi.where())
    while True:
        try:
            page = urlopen(link_url, timeout=15, context=ssl_context).read().decode('utf-8-sig')
            return re.findall(r"href=[\"\'](.*?)[\"\']", page)
        except (HTTPError, URLError) as error:
            print('Data not retrieved because %s\nURL: %s' % (error, link_url))
        except socket.timeout:
            print('socket timed out - URL: %s' % (link_url))


def _download_file(link, directory, n_tries_limit=3):
    """Download *link* into *directory* with wget -N (skip if up to date).

    The server sporadically hangs, so each link is attempted up to
    *n_tries_limit* times. Returns True on success, False if all tries failed.
    """
    for n_tries in range(n_tries_limit):
        if n_tries == 0:
            print('Checking/Downloading %s' % (link))
        else:
            print('*** Previous check/download failed. Re-trying for %s' % (link))
        # List-form argv with shell=False: the URL comes from scraped remote
        # HTML, so it must never be interpolated into a shell command line.
        process = subprocess.run(['wget', '-N', '-P', directory, link, '-q', '-o', '/dev/null'],
                                 stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf8')
        if process.returncode == 0:
            return True
    return False


def _untar_and_clean(link, directory):
    """Un-tar the downloaded archive in *directory*, then delete the
    isd-history metadata files and the tar file itself."""
    tar_name = link.split('/')[-1]
    tar_path = os.path.join(directory, tar_name)
    print('Un-tarring file')
    subprocess.run(['tar', '-xf', tar_path, '-C', directory],
                   stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf8')
    # remove isd-history metadata files (glob replaces the old shell "rm ...*")
    for history_file in glob.glob(os.path.join(directory, 'isd-history*')):
        os.remove(history_file)
    # remove the tar file (tolerate a failed download having left nothing)
    if os.path.exists(tar_path):
        os.remove(tar_path)


def scraper(mode, version):
    """Download NOAA ISD yearly archives into the GHOST original_files tree.

    Parameters
    ----------
    mode : str
        'all'  -> fixed historical range 1971..2023 inclusive.
        'nrt'  -> current year only; also forces version = 'nrt'.
    version : str
        Sub-directory name under original_files/ (overridden in 'nrt' mode).

    Raises
    ------
    ValueError
        If *mode* is neither 'all' nor 'nrt' (the original code printed a
        message and then crashed with NameError on undefined start_year).
    """
    if mode == 'all':
        start_year = 1971
        end_year = 2024
    elif mode == 'nrt':
        start_year = date.today().strftime('%Y')
        # today + 365 days lands in the next calendar year, so the
        # half-open range below covers exactly the current year
        end_year = (date.today() + timedelta(days=365)).strftime('%Y')
        version = mode
    else:
        raise ValueError('time mode inapplicable: %r' % (mode,))

    # iterate through years (range is half-open: end_year itself excluded)
    for year in range(int(start_year), int(end_year)):
        print(year)

        link_url = 'https://www.ncei.noaa.gov/data/global-hourly/archive/isd'
        link_data = _fetch_link_list(link_url)

        # keep only the archive links for this year (isd_<year>... tarballs)
        link_list = ['{}/{}'.format(link_url, lnk) for lnk in link_data
                     if 'isd_{}'.format(year) in lnk]

        # dir to save files
        specific_directory = '/esarchive/obs/ghost/NOAA_ISD/original_files/{}/meteo/{}/'.format(version, year)
        os.makedirs(specific_directory, exist_ok=True)

        # iterate through each link: download (wget -N re-checks files that
        # already exist locally), un-tar, and clean up
        for link in link_list:
            _download_file(link, specific_directory)
            _untar_and_clean(link, specific_directory)