import os
import re
import socket
import ssl
import subprocess
from datetime import date, timedelta
from urllib.error import HTTPError, URLError
from urllib.request import urlopen

import certifi

def scraper(mode, version):
    if mode == 'all':
        start_year = 1971
        end_year = 2024
    elif mode == 'nrt':
        start_year = date.today().strftime('%Y')
        end_year = (date.today() + timedelta(days=365)).strftime('%Y')
        version = mode
    else:
        print('time mode inapplicable')
        return

    # certificate bundle for HTTPS requests
    # (urlopen's cafile argument was removed in Python 3.13, so pass an SSL context instead)
    ssl_context = ssl.create_default_context(cafile=certifi.where())

    # iterate through years (end year is exclusive)
    for year in range(int(start_year), int(end_year)):
        print(year)
        link_url = 'https://www.ncei.noaa.gov/data/global-hourly/archive/isd'

        # read the archive index, catching server connection exceptions and retrying until it succeeds
        read_url = False
        while not read_url:
            try:
                link_data = re.findall("href=[\"\'](.*?)[\"\']", urlopen(link_url, timeout=15, context=ssl_context).read().decode('utf-8-sig'))
                read_url = True
            except HTTPError as error:
                print('Data not retrieved because %s\nURL: %s' % (error, link_url))
            except URLError as error:
                print('Data not retrieved because %s\nURL: %s' % (error, link_url))
            except socket.timeout:
                print('socket timed out - URL: %s' % (link_url))

        # keep only the links for the current year's tar archives
        link_list = ['{}/{}'.format(link_url, lnk) for lnk in link_data if 'isd_{}'.format(year) in lnk]

        # directory to save files into
        specific_directory = '/esarchive/obs/ghost/NOAA_ISD/original_files/{}/meteo/{}/'.format(version, year)
        os.makedirs(specific_directory, exist_ok=True)

        # iterate through each link and download it to the required directory;
        # wget -N only re-downloads a file when the remote copy is newer than the local one.
        # The server sporadically hangs for ~3 minutes, so try each link a limited
        # number of times before giving up.
        n_tries_limit = 3
        for link in link_list:
            n_tries = 0
            errcode = 999
            while (n_tries < n_tries_limit) and (errcode != 0):
                if n_tries == 0:
                    print('Checking/Downloading %s' % (link))
                else:
                    print('*** Previous check/download failed. Re-trying for %s' % (link))
                cmd = 'wget -N -P %s %s -q -o /dev/null' % (specific_directory, link)
                process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf8', shell=True)
                status = process.communicate()[0]
                errcode = process.returncode
                if errcode != 0:
                    n_tries += 1

            # untar the downloaded archive
            lnk = link.split('/')[-1]
            cmd = 'tar -xf {}/{} -C {}'.format(specific_directory, lnk, specific_directory)
            print('Un-tarring file')
            process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf8', shell=True)
            status = process.communicate()[0]
            errcode = process.returncode

            # remove the bundled isd-history metadata files
            cmd = 'rm {}/isd-history*'.format(specific_directory)
            process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf8', shell=True)
            status = process.communicate()[0]
            errcode = process.returncode

            # remove the tar file itself once extracted
            cmd = 'rm {}/{}'.format(specific_directory, lnk)
            process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf8', shell=True)
            status = process.communicate()[0]
            errcode = process.returncode