import glob
import os
import re
import socket
import ssl
import subprocess
from datetime import date, timedelta
from urllib.error import HTTPError, URLError
from urllib.request import urlopen


def scraper(mode, version):
    """Download, un-tar and clean up NOAA ISD yearly hourly-data archives.

    Parameters
    ----------
    mode : str
        'all'  -> fetch every year from 1971 up to (but not including) 2024;
        'nrt'  -> fetch the current year only (near-real-time); in this mode
                  ``version`` is overridden with the mode name, matching the
                  original behaviour.
        Any other value prints a message and returns without doing anything.
    version : str
        Version label used to build the destination directory under
        /esarchive/obs/ghost/NOAA_ISD/original_files/.

    Returns
    -------
    None. Side effects only: files downloaded/extracted under the archive tree.
    """
    if mode == 'all':
        start_year = 1971
        end_year = 2024
    elif mode == 'nrt':
        start_year = date.today().strftime('%Y')
        end_year = (date.today() + timedelta(days=365)).strftime('%Y')
        version = mode
    else:
        print('time mode inapplicable')
        # FIX: originally fell through and raised NameError on start_year.
        return

    # FIX: urlopen's cafile= argument (previously fed certifi.where()) was
    # deprecated in 3.6 and removed in Python 3.12; pass an SSL context with
    # the system CA bundle instead.
    ssl_context = ssl.create_default_context()

    # NOTE: range() excludes end_year, preserving the original iteration span.
    for year in range(int(start_year), int(end_year)):
        print(year)
        link_url = 'https://www.ncei.noaa.gov/data/global-hourly/archive/isd'

        # Fetch the directory listing. FIX: the original retried forever on
        # persistent failure; cap the attempts and skip the year instead.
        link_data = None
        for _attempt in range(5):
            try:
                page = urlopen(link_url, timeout=15,
                               context=ssl_context).read().decode('utf-8-sig')
                link_data = re.findall("href=[\"\'](.*?)[\"\']", page)
                break
            except HTTPError as error:
                print('Data not retrieved because %s\nURL: %s' % (error, link_url))
            except URLError as error:
                print('Data not retrieved because %s\nURL: %s' % (error, link_url))
            except socket.timeout:
                print('socket timed out - URL: %s' % (link_url))
        if link_data is None:
            print('giving up on year %s after repeated failures' % year)
            continue

        # Keep only the links for this year's tarball(s).
        link_list = ['{}/{}'.format(link_url, lnk)
                     for lnk in link_data if 'isd_{}'.format(year) in lnk]

        # Destination directory for the downloaded files.
        specific_directory = ('/esarchive/obs/ghost/NOAA_ISD/original_files/'
                              '{}/meteo/{}/'.format(version, year))
        os.makedirs(specific_directory, exist_ok=True)

        # Download each link, retrying a bounded number of times because the
        # server sporadically hangs; wget -N re-downloads only when the remote
        # file is newer than the local copy.
        n_tries_limit = 3
        for link in link_list:
            n_tries = 0
            errcode = 999
            # FIX: use logical 'and' instead of bitwise '&'.
            while (n_tries < n_tries_limit) and (errcode != 0):
                if n_tries == 0:
                    print('Checking/Downloading %s' % (link))
                else:
                    print('*** Previous check/download failed. Re-trying for %s' % (link))
                # FIX: argv list + shell=False — no shell injection via the URL
                # or directory path (originally a shell=True format string).
                process = subprocess.run(
                    ['wget', '-N', '-P', specific_directory, link,
                     '-q', '-o', '/dev/null'],
                    stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                errcode = process.returncode
                if errcode != 0:
                    n_tries += 1

            if errcode != 0:
                # FIX: the original fell through and tried to un-tar a file
                # that was never downloaded.
                print('download failed for %s; skipping un-tar' % link)
                continue

            # Un-tar the downloaded archive into the same directory.
            lnk = link.split('/')[-1]
            archive_path = os.path.join(specific_directory, lnk)
            print('Un-tarring file')
            subprocess.run(['tar', '-xf', archive_path, '-C', specific_directory],
                           stdout=subprocess.PIPE, stderr=subprocess.PIPE)

            # Remove the station-history metadata shipped inside the archive.
            # FIX: 'rm isd-history*' relied on shell globbing; expand in Python.
            for history_file in glob.glob(os.path.join(specific_directory,
                                                       'isd-history*')):
                os.remove(history_file)

            # Remove the tar file itself now that it has been extracted.
            os.remove(archive_path)