# NOAA_ISD_download.py
# Downloads and unpacks NOAA Integrated Surface Database (ISD) hourly archives.
import glob
import os
import re
import socket
import ssl
import subprocess
from datetime import date
from datetime import timedelta
from urllib.error import HTTPError, URLError
from urllib.request import urlopen

import certifi




def _fetch_link_list(link_url):
    """Return every href target found on *link_url*.

    Retries indefinitely on server/connection errors, printing each failure
    (preserves the original behaviour of blocking until the index page loads).
    """
    # cafile= on urlopen is deprecated (removed in Python 3.12); build an
    # explicit SSL context from the certifi CA bundle instead.
    ssl_context = ssl.create_default_context(cafile=certifi.where())
    while True:
        try:
            page = urlopen(link_url, timeout=15, context=ssl_context).read().decode('utf-8-sig')
            return re.findall(r"href=[\"\'](.*?)[\"\']", page)
        except (HTTPError, URLError) as error:
            print('Data not retrieved because %s\nURL: %s' % (error, link_url))
        except socket.timeout:
            print('socket timed out - URL: %s' % (link_url))


def _download_file(link, directory, n_tries_limit=3):
    """Download *link* into *directory* with wget -N (skip if up to date).

    The server sporadically hangs, so each link is attempted up to
    *n_tries_limit* times. Returns True on success, False if all tries failed.
    """
    for n_tries in range(n_tries_limit):
        if n_tries == 0:
            print('Checking/Downloading %s' % (link))
        else:
            print('*** Previous check/download failed. Re-trying for %s' % (link))
        # List-form argv with shell=False: the URL comes from scraped remote
        # HTML, so it must never be interpolated into a shell command line.
        process = subprocess.run(['wget', '-N', '-P', directory, link, '-q', '-o', '/dev/null'],
                                 stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf8')
        if process.returncode == 0:
            return True
    return False


def _untar_and_clean(link, directory):
    """Un-tar the downloaded archive in *directory*, then delete the
    isd-history metadata files and the tar file itself."""
    tar_name = link.split('/')[-1]
    tar_path = os.path.join(directory, tar_name)
    print('Un-tarring file')
    subprocess.run(['tar', '-xf', tar_path, '-C', directory],
                   stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf8')
    # remove isd-history metadata files (glob replaces the old shell "rm ...*")
    for history_file in glob.glob(os.path.join(directory, 'isd-history*')):
        os.remove(history_file)
    # remove the tar file (tolerate a failed download having left nothing)
    if os.path.exists(tar_path):
        os.remove(tar_path)


def scraper(mode, version):
    """Download NOAA ISD yearly archives into the GHOST original_files tree.

    Parameters
    ----------
    mode : str
        'all'  -> fixed historical range 1971..2023 inclusive.
        'nrt'  -> current year only; also forces version = 'nrt'.
    version : str
        Sub-directory name under original_files/ (overridden in 'nrt' mode).

    Raises
    ------
    ValueError
        If *mode* is neither 'all' nor 'nrt' (the original code printed a
        message and then crashed with NameError on undefined start_year).
    """
    if mode == 'all':
        start_year = 1971
        end_year = 2024
    elif mode == 'nrt':
        start_year = date.today().strftime('%Y')
        # today + 365 days lands in the next calendar year, so the
        # half-open range below covers exactly the current year
        end_year = (date.today() + timedelta(days=365)).strftime('%Y')
        version = mode
    else:
        raise ValueError('time mode inapplicable: %r' % (mode,))

    # iterate through years (range is half-open: end_year itself excluded)
    for year in range(int(start_year), int(end_year)):
        print(year)

        link_url = 'https://www.ncei.noaa.gov/data/global-hourly/archive/isd'
        link_data = _fetch_link_list(link_url)

        # keep only the archive links for this year (isd_<year>... tarballs)
        link_list = ['{}/{}'.format(link_url, lnk) for lnk in link_data
                     if 'isd_{}'.format(year) in lnk]

        # dir to save files
        specific_directory = '/esarchive/obs/ghost/NOAA_ISD/original_files/{}/meteo/{}/'.format(version, year)
        os.makedirs(specific_directory, exist_ok=True)

        # iterate through each link: download (wget -N re-checks files that
        # already exist locally), un-tar, and clean up
        for link in link_list:
            _download_file(link, specific_directory)
            _untar_and_clean(link, specific_directory)