# NOTE(review): removed stray "Newer" / "Older" page-navigation residue that
# preceded the imports — as bare names they would raise NameError at import time.
# standard library
import csv
import json
import os
import re
import socket
import ssl
import subprocess
import time
from datetime import date
from datetime import timedelta
from urllib.error import HTTPError, URLError
from urllib.request import urlopen

# third-party
import certifi
import requests
def _run_shell(cmd):
    """Run *cmd* through the shell, discard its output, return the exit code."""
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                               encoding='utf8', shell=True)
    process.communicate()
    return process.returncode


def download_data(mode, version, n_max_tries, max_time_per_dl):
    """Download and unpack yearly NOAA ISD global-hourly archive tarballs.

    NOTE(review): the middle of this function was lost in extraction (only
    bare line numbers survived); the download/untar/cleanup sequence below is
    a reconstruction of the visible logic — confirm against the repository.

    Parameters
    ----------
    mode : str
        'all' downloads the full historical record (years 1971..2023);
        'nrt' downloads the current year only and forces ``version = 'nrt'``.
    version : str
        Version tag used in the output directory path (overridden in 'nrt' mode).
    n_max_tries : int
        Maximum attempts per network operation before giving up.
    max_time_per_dl : float
        Timeout in seconds for each network read.
    """
    if mode == 'all':
        start_year = 1971
        end_year = 2024
    elif mode == 'nrt':
        start_year = date.today().strftime('%Y')
        end_year = (date.today() + timedelta(days=365)).strftime('%Y')
        version = mode
    else:
        # original fell through here with start_year undefined (NameError);
        # bail out explicitly instead
        print('time mode inapplicable')
        return

    # iterate through years (range end is exclusive, matching the original)
    for year in range(int(start_year), int(end_year)):
        print(year)
        link_url = 'https://www.ncei.noaa.gov/data/global-hourly/archive/isd'

        # Read the archive index page, retrying on transient server errors.
        # Bounded by n_max_tries — the original looped forever on failure.
        # ssl context replaces the cafile= keyword (removed in Python 3.12).
        link_data = None
        n_tries = 0
        ssl_context = ssl.create_default_context(cafile=certifi.where())
        while (link_data is None) and (n_tries < n_max_tries):
            try:
                page = urlopen(link_url, timeout=max_time_per_dl,
                               context=ssl_context).read().decode('utf-8-sig')
                link_data = re.findall("href=[\"\'](.*?)[\"\']", page)
            except HTTPError as error:
                print('Data not retrieved because %s\nURL: %s' % (error, link_url))
                n_tries += 1
            except URLError as error:
                print('Data not retrieved because %s\nURL: %s' % (error, link_url))
                n_tries += 1
            except socket.timeout:
                print('socket timed out - URL: %s' % (link_url))
                n_tries += 1
        if link_data is None:
            print('Could not read %s after %s tries, skipping year %s'
                  % (link_url, n_max_tries, year))
            continue

        # keep only the archive links for this year
        link_list = ['{}/{}'.format(link_url, lnk) for lnk in link_data
                     if 'isd_{}'.format(year) in lnk]

        # dir to save files
        specific_directory = ('/esarchive/obs/ghost/NOAA_ISD/original_files/'
                              '{}/meteo/{}/'.format(version, year))
        os.makedirs(specific_directory, exist_ok=True)

        # Iterate through each link and download to the required directory.
        # wget -N only refetches when the remote file is newer than the local
        # copy; the server sporadically hangs, so retry up to n_max_tries
        # (the original set up n_tries/errcode but had no actual retry loop).
        for link in link_list:
            errcode = 999
            for n_tries in range(n_max_tries):
                if n_tries == 0:
                    print('Checking/Downloading %s' % (link))
                else:
                    print('*** Previous check/download failed. Re-trying for %s' % (link))
                errcode = _run_shell('wget -N -P %s %s -q -o /dev/null'
                                     % (specific_directory, link))
                if errcode == 0:
                    break
            if errcode != 0:
                print('*** Giving up on %s after %s tries' % (link, n_max_tries))
                continue

            # untar file
            lnk = link.split('/')[-1]
            print('Un-tarring file')
            _run_shell('tar -xf {}/{} -C {}'.format(specific_directory, lnk,
                                                    specific_directory))
            # remove isd history
            _run_shell('rm {}/isd-history*'.format(specific_directory))
            # remove tar file
            _run_shell('rm {}/{}'.format(specific_directory, lnk))
def download_metadata(n_max_tries, max_time_per_dl):
    """Download the NOAA ISD station-history CSV and merge it into the
    standardised processed JSON, appending changed values with their
    update dates.

    Parameters
    ----------
    n_max_tries : int
        Maximum number of download attempts.
    max_time_per_dl : float
        Initial request timeout in seconds (doubled after every failure).
    """
    url_metadata = 'https://www.ncei.noaa.gov/pub/data/noaa/isd-history.csv'
    download_location = "/esarchive/obs/ghost/NOAA_ISD/metadata/network_provided/NOAA_ISD_META_{}.csv"
    today = date.today()

    n_tries = 0
    errcode = 999
    downloaded = False
    while (n_tries < n_max_tries) and (errcode != 200):
        # The original let requests exceptions (timeouts, connection errors)
        # escape and kill the retry loop; count them as failed attempts instead.
        try:
            r = requests.get(url_metadata, timeout=max_time_per_dl)
        except requests.exceptions.RequestException as e:
            print('Request error {}, attempt {}'.format(e, n_tries))
            n_tries += 1
            max_time_per_dl = max_time_per_dl * 2  # increase waiting time
            time.sleep(n_tries ** 2)  # wait a lil more every time
            continue
        if r.status_code == 200:
            with open(download_location.format(today.strftime('%Y%m%d')), 'wb') as outfile:
                outfile.write(r.content)
            print('Downloaded metadata')
            downloaded = True
            errcode = r.status_code
        elif r.status_code == 404:
            # nothing on the server: leave the loop (no file was written)
            print("No metadata found, error 404")
            errcode = 200
        else:
            # try again
            print('Response error {}, attempt {}'.format(r.status_code, n_tries))
            errcode = r.status_code
            n_tries += 1
            max_time_per_dl = max_time_per_dl * 2  # increase waiting time
            time.sleep(n_tries ** 2)  # wait a lil more every time
        if n_tries == n_max_tries:
            print('Failed downloading {} {} times in {} seconds, error code {}'.format(
                url_metadata, n_tries, max_time_per_dl, errcode))
        time.sleep(1)

    if not downloaded:
        # the original proceeded to open the missing file and crashed with
        # FileNotFoundError; bail out cleanly instead
        print('No new metadata file downloaded, nothing to merge')
        return

    # create json in desired shape from current metadata file: every CSV cell
    # becomes {'values': [...], 'update_time': [...]} keyed by station USAF id
    update_date = today.strftime('%Y-%m-%d')
    json_metadata_now = {}
    with open(download_location.format(today.strftime('%Y%m%d')), encoding='utf-8') as file:
        for row in csv.DictReader(file):
            key = row['USAF']
            for parameter in row:
                row[parameter] = {'values': [row[parameter]],
                                  'update_time': [update_date]}
            json_metadata_now[key] = row

    # read standardised file to compare!
    with open('/esarchive/obs/ghost/NOAA_ISD/metadata/processed/NOAA_ISD_META.json',
              'r', encoding='utf-8') as f:
        json_metadata = json.loads(f.read())

    # loop through all the old stations; append changed values
    for station in json_metadata:
        if station in json_metadata_now:
            for parameter in json_metadata[station]:
                if parameter in json_metadata_now[station]:
                    old_value = json_metadata[station][parameter]['values'][-1]
                    new_value = json_metadata_now[station][parameter]['values'][0]
                    # compare last entry in standardised file to the new value
                    if old_value != new_value:
                        print("old {} --- new {}".format(old_value, new_value))
                        json_metadata[station][parameter]['values'].append(new_value)
                        json_metadata[station][parameter]['update_time'].append(
                            json_metadata_now[station][parameter]['update_time'][0])
                else:
                    print('{} not in new metadata file'.format(parameter))
        else:
            print('Station {} was abolished'.format(station))

    # loop through all the new stations: add stations and parameters that
    # were not in the old file (existing ones were compared above)
    for station in json_metadata_now:
        for parameter in json_metadata_now[station]:
            if station not in json_metadata:
                print('New station {}'.format(station))
                json_metadata.update({station: json_metadata_now[station]})
            if parameter not in json_metadata[station]:
                print('{} is new'.format(parameter))
                json_metadata[station].update(
                    {parameter: json_metadata_now[station][parameter]})

    # save
    with open('/esarchive/obs/ghost/NOAA_ISD/metadata/processed/NOAA_ISD_META.json',
              'w', encoding='utf-8') as f:
        f.write(json.dumps(json_metadata, indent=4))