Commits (2)
@@ -171,20 +171,21 @@ def download_metadata(n_max_tries, max_time_per_dl):
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
time.sleep(1)
# create json from original metadata file
json_metadata = {}
with open('/esarchive/obs/ghost/CAPMoN/metadata/network_provided/CAPMoN_META.csv', 'r', encoding='ISO-8859-1') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['ID']+'_'+row['Measurements_Mesures'].replace('"', '')[:3]
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/CAPMoN/metadata/processed/CAPMoN_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
# create json in desired shape from current metadata file
@@ -193,7 +194,7 @@ def download_metadata(n_max_tries, max_time_per_dl):
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['ID']+'_'+row['Measurements_Mesures'].replace('"', '')[:3]
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
@@ -218,11 +219,18 @@ def download_metadata(n_max_tries, max_time_per_dl):
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
for parameter in json_metadata_now[station]: # loop through all the parameters
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# is there a new parameter that wasn't in the old file?
if parameter in json_metadata[station].keys():
pass # parameter (column) is already there
else:
print('{} is new'.format(parameter))
json_metadata[station].update({parameter: json_metadata_now[station][parameter]})
# save
...
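The per-station, per-parameter comparison above (and its copies in the other network scripts below) could live in one shared helper. A minimal sketch, assuming the same {station: {parameter: {'values': [...], 'update_time': [...]}}} layout the scripts already use; the helper name merge_station_metadata is an illustration, not code from the repository:

def merge_station_metadata(old_meta, new_meta):
    """Fold freshly scraped metadata (new_meta) into the standardised history (old_meta)."""
    # update or flag parameters of stations that already exist
    for station, parameters in old_meta.items():
        if station not in new_meta:
            print('Station {} was abolished'.format(station))
            continue
        for parameter, history in parameters.items():
            if parameter not in new_meta[station]:
                print('{} not in new metadata file'.format(parameter))
                continue
            current = new_meta[station][parameter]
            if history['values'][-1] != current['values'][0]:
                print("old {} --- new {}".format(history['values'][-1], current['values'][0]))
                history['values'].append(current['values'][0])
                history['update_time'].append(current['update_time'][0])
    # add stations and parameters that appeared since the last run
    for station, parameters in new_meta.items():
        if station not in old_meta:
            print('New station {}'.format(station))
            old_meta[station] = parameters
            continue
        for parameter, current in parameters.items():
            if parameter not in old_meta[station]:
                print('{} is new'.format(parameter))
                old_meta[station][parameter] = current
    return old_meta

Each download_metadata() would then only have to build new_meta from the freshly downloaded CSV, call the helper, and write the result back to the processed JSON.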
@@ -15,14 +15,30 @@ import zipfile
import os.path
import os
import pandas as pd
def download_data(mode, version, n_max_tries, max_time_per_dl):
url = 'https://monitoring.eanet.asia/document/menu/index#publicData'
download_location = "/esarchive/obs/ghost/EANET/original_files/{}/".format(version)
today = date.today().strftime('%Y%m%d') #+ timedelta(days = 1)).strftime('%Y%m%d') # problem with timezones???
#print(today)
if mode == 'all':
bdate = date(1980, 1, 1) #date(1960, 1, 1) # date before record starts
edate = date.today()
os.makedirs('/esarchive/obs/ghost/EANET/original_files/{}/'.format(version), exist_ok=True)
download_location = "/esarchive/obs/ghost/EANET/original_files/{}/".format(version)
elif mode == 'nrt':
print("EANET no nrt")
exit()
else:
print('time mode inapplicable')
exit()
options = Options()
prefs = {'download.default_directory' : download_location}
@@ -32,17 +48,17 @@ def scraper(mode, version):
driver = webdriver.Chrome(service=svc, options=options)
driver.get(url)
time.sleep(max_time_per_dl)
# login
email = driver.find_element(By.ID, "email")
email.send_keys("raphael.grodofzig@bsc.es")
passwd = driver.find_element(By.ID, "passwd")
passwd.send_keys("274s9QZ5")
time.sleep(max_time_per_dl)
driver.find_element(By.NAME, "submitBtn").click()
time.sleep(max_time_per_dl)
# find countries
dropdown_element = driver.find_element(By.ID, 'countryCd')
@@ -104,4 +120,109 @@ def scraper(mode, version):
i=i+1
driver.close()
\ No newline at end of file
def download_metadata(n_max_tries, max_time_per_dl):
url_metadata = 'https://www.eanet.asia/wp-content/uploads/2024/01/Site_Information_Acid_Deposition_Monitoring_NMP2023_1117.xlsm'
download_location = "/esarchive/obs/ghost/EANET/metadata/network_provided/EANET_META_{}.csv"
Headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
n_tries = 0
errcode = 999
today = date.today()
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url_metadata, timeout=max_time_per_dl, headers=Headers)
if r.status_code == 200:
with open(download_location.format(today.strftime('%Y%m%d')), 'wb') as outfile:
outfile.write(r.content)
print('Downloaded metadata')
errcode = r.status_code
elif r.status_code == 404:
print("No metadata found, error 404")
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2 # increase waiting time
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
time.sleep(1)
metadata = pd.read_excel(download_location.format(today.strftime('%Y%m%d')), engine='openpyxl').fillna('') # .xlsm workbooks need the openpyxl engine; pyxlsb only reads .xlsb
print(metadata)
"""
# create json from original metadata file
json_metadata = {}
with open('/esarchive/obs/ghost/EANET/metadata/network_provided/EANET_META.csv', 'r', encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['cve_estac']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/EANET/metadata/processed/EANET_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
"""
"""
# create json in desired shape from current metadata file
json_metadata_now = {}
with open(download_location.format(today.strftime('%Y%m%d')), encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['cve_estac']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata_now[key] = row
# read standardised file to compare!
with open('/esarchive/obs/ghost/EANET/metadata/processed/EANET_META.json', 'r', encoding='ISO-8859-1') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if parameter in json_metadata_now[station].keys(): # check if column of csv exists in new file
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('{} not in new metadata file'.format(parameter))
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
for parameter in json_metadata_now[station]: # loop through all the parameters
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# is there a new parameter that wasn't in the old file?
if parameter in json_metadata[station].keys():
pass # parameter (column) is already there
else:
print('{} is new'.format(parameter))
json_metadata[station].update({parameter: json_metadata_now[station][parameter]})
# save
with open('/esarchive/obs/ghost/EANET/metadata/processed/EANET_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))"""
\ No newline at end of file
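The EANET download_metadata() above stores the downloaded .xlsm bytes under a .csv filename and stops after printing the parsed workbook. A short continuation in the spirit of the MITECO script, writing a dated CSV copy, might look like this; the output filename is an assumption for illustration:

import pandas as pd
from datetime import date

today = date.today()
raw_path = '/esarchive/obs/ghost/EANET/metadata/network_provided/EANET_META_{}.csv'.format(today.strftime('%Y%m%d'))

# the file is really an .xlsm workbook despite the .csv suffix, so read it with openpyxl
metadata = pd.read_excel(raw_path, engine='openpyxl').fillna('')

# hypothetical flattened CSV written next to the raw download
metadata.to_csv('/esarchive/obs/ghost/EANET/metadata/network_provided/EANET_META_{}_flat.csv'.format(today.strftime('%Y%m%d')), index=False)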
import requests
import time
from datetime import date
from datetime import timedelta
import zipfile
import urllib
import os.path
import os
import pandas as pd
def scraper(mode):
url = 'https://monitoring.eanet.asia/document/menu/index#publicData'
download_url = 'https://www.eanet.asia/wp-content/uploads/2024/01/Site_Information_Acid_Deposition_Monitoring_NMP2023_1117.xlsm'
download_location = "/esarchive/obs/ghost/EANET/metadata/network_provided/"
today = date.today().strftime('%Y%m%d') #+ timedelta(days = 1)).strftime('%Y%m%d') # problem with timezones???
"""
Headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
r = requests.get(download_url, timeout=120, headers=Headers)
print(r.status_code)
if r.status_code == 200:
urllib.request.urlretrieve(url, download_location+"downloaded_metadata.xlsm")
print('Downloaded metadata')
else:
print('url status not ok')"""
# open file
metadata = pd.read_excel(download_location+"downloaded_metadata.xlsm", engine='pyxlsb').fillna('')
print(metadata)
@@ -8,20 +8,20 @@ import urllib
import tarfile
import shutil
import gzip
import csv
import json
def download_data(mode, version, n_max_tries, max_time_per_dl):
base_url = 'http://www.aire.cdmx.gob.mx/opendata/anuales_horarios_gz/contaminantes_{}.csv.gz'
if mode == 'all':
bdate = date(1980, 1, 1) #date(1960, 1, 1) # date before record starts
edate = date.today()
os.makedirs('/esarchive/obs/ghost/MEXICO_CDMX/original_files/{}/'.format(version), exist_ok=True)
download_location = '/esarchive/obs/ghost/MEXICO_CDMX/original_files/'+version+'/contaminantes_{}.csv.gz'
elif mode == 'nrt':
@@ -39,21 +39,143 @@ def scraper(mode, version):
# download
for year in years:
url = base_url.format(year)
n_tries = 0
errcode = 999
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url, timeout=max_time_per_dl)
if r.status_code == 200:
urllib.request.urlretrieve(url, download_location.format(year))
print('Downloaded {}'.format(url))
# unzip
with gzip.open(download_location.format(year), 'rb') as f_in:
with open(download_location.format(year)[:-3], 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
# remove files
os.remove(download_location.format(year))
errcode = r.status_code
elif r.status_code == 404:
print("No data found, error 404, year {}".format(year))
errcode = 200
elif r.status_code == 403:
print("Permission denied for {}".format(year))
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2 # increase waiting time
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url, n_tries, max_time_per_dl, errcode))
time.sleep(1)
def download_metadata(n_max_tries, max_time_per_dl):
url_metadata = 'http://www.aire.cdmx.gob.mx/opendata/catalogos/cat_estacion.csv'
download_location = "/esarchive/obs/ghost/MEXICO_CDMX/metadata/network_provided/MEXICO_CDMX_META_{}.csv"
n_tries = 0
errcode = 999
today = date.today()
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url_metadata, timeout=max_time_per_dl)
if r.status_code == 200:
with open(download_location.format('_unformatted'), 'wb') as outfile:
outfile.write(r.content)
print('Downloaded metadata')
errcode = r.status_code
elif r.status_code == 404:
print("No metadata found, error 404")
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2 # increase waiting time
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
time.sleep(1)
# import it as pandas to clean header
meta_file = pd.read_csv(download_location.format('_unformatted'), header=[1], encoding='ISO-8859-1')
meta_file.to_csv(download_location.format(today.strftime('%Y%m%d')), index=False)
os.remove(download_location.format('_unformatted'))
# create json from original metadata file
"""json_metadata = {}
with open('/esarchive/obs/ghost/MEXICO_CDMX/metadata/network_provided/MEXICO_CDMX_META.csv', 'r', encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['cve_estac']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/MEXICO_CDMX/metadata/processed/MEXICO_CDMX_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
"""
# create json in desired shape from current metadata file
json_metadata_now = {}
with open(download_location.format(today.strftime('%Y%m%d')), encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['cve_estac']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata_now[key] = row
# read standardised file to compare!
with open('/esarchive/obs/ghost/MEXICO_CDMX/metadata/processed/MEXICO_CDMX_META.json', 'r', encoding='ISO-8859-1') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if parameter in json_metadata_now[station].keys(): # check if column of csv exists in new file
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('{} not in new metadata file'.format(parameter))
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
for parameter in json_metadata_now[station]: # loop through all the parameters
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# is there a new parameter that wasn't in the old file?
if parameter in json_metadata[station].keys():
pass # parameter (column) is already there
else:
print('{} is new'.format(parameter))
json_metadata[station].update({parameter: json_metadata_now[station][parameter]})
# save
with open('/esarchive/obs/ghost/MEXICO_CDMX/metadata/processed/MEXICO_CDMX_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
\ No newline at end of file
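The year loop above issues two requests per file (requests.get to probe the status, then urllib.request.urlretrieve to download again) before decompressing. A sketch of that step as a single-pass helper; the name fetch_and_gunzip and its signature are assumptions, not part of the scripts:

import gzip
import os
import shutil
import requests

def fetch_and_gunzip(url, gz_path, timeout):
    """Download a .csv.gz archive once, decompress it next to itself and drop the archive.
    Returns the HTTP status code so the caller's retry loop can react to it."""
    r = requests.get(url, timeout=timeout)
    if r.status_code != 200:
        return r.status_code
    with open(gz_path, 'wb') as f:
        f.write(r.content)  # reuse the body already fetched instead of downloading the file twice
    with gzip.open(gz_path, 'rb') as f_in, open(gz_path[:-3], 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    os.remove(gz_path)
    return r.status_code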
@@ -22,22 +22,18 @@ from selenium.webdriver.support import expected_conditions as EC
def download_data(mode, version, n_max_tries, max_time_per_dl):
baseurl = 'https://www.miteco.gob.es/es/calidad-y-evaluacion-ambiental/temas/atmosfera-y-calidad-del-aire/calidad-del-aire/evaluacion-datos/datos/datos-2001-2021.html'
if mode == 'all':
bdate = date(2001, 1, 1) #date(1960, 1, 1) # date before record starts
edate = date.today()
os.makedirs('/esarchive/obs/ghost/MITECO/original_files/{}/'.format(version), exist_ok=True)
download_location = '/esarchive/obs/ghost/MITECO/original_files/{}/'.format(version)
elif mode == 'nrt':
print("nrt not available")
download_location = '/esarchive/obs/ghost/MITECO/original_files/nrt/'
else:
@@ -70,23 +66,40 @@ def scraper(mode, version):
for zip_link in zip_links:
filename = zip_link.get("href").rpartition('/')[-1]
url = 'https://www.miteco.gob.es/{}'.format(zip_link.get("href"))
n_tries = 0
errcode = 999
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url, timeout=max_time_per_dl)
if r.status_code == 200:
urllib.request.urlretrieve(url, download_location+filename)
print('Downloaded {}'.format(filename))
# unzip
with zipfile.ZipFile(download_location+filename, 'r') as zip_ref:
zip_ref.extractall(download_location)
os.remove(download_location+filename)
errcode = r.status_code
elif r.status_code == 404:
print("No data found, error 404")
errcode = 200
elif r.status_code == 403:
print("Permission denied for {}".format(url))
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url, n_tries, max_time_per_dl, errcode))
time.sleep(1)
# go to hyperlinks
@@ -118,21 +131,41 @@ def scraper(mode, version):
os.remove(zip_file)
continue
n_tries = 0
errcode = 999
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url, timeout=max_time_per_dl)
if r.status_code == 200:
urllib.request.urlretrieve(url, download_location+filename)
print('Downloaded {}'.format(filename))
# unzip
with zipfile.ZipFile(download_location+filename, 'r') as zip_ref:
zip_ref.extractall(download_location)
os.remove(download_location+filename)
errcode = r.status_code
elif r.status_code == 404:
print("No data found, error 404")
errcode = 200
elif r.status_code == 403:
print("Permission denied for {}".format(url))
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url, n_tries, max_time_per_dl, errcode))
time.sleep(1)
# delete metadata
@@ -152,3 +185,96 @@ def scraper(mode, version):
driver.close()
def download_metadata(n_max_tries, max_time_per_dl):
url_metadata = 'https://www.miteco.gob.es/content/dam/miteco/es/calidad-y-evaluacion-ambiental/sgalsi/atm%C3%B3sfera-y-calidad-del-aire/evaluaci%C3%B3n-2022/Metainformacion2022.xlsx'
download_location = "/esarchive/obs/ghost/MITECO/metadata/network_provided/MITECO_META_{}.xlsx"
Headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
n_tries = 0
errcode = 999
today = date.today()
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url_metadata, headers=Headers, timeout=max_time_per_dl) # request inside the loop so each retry re-downloads
if r.status_code == 200:
with open(download_location.format(today.strftime('%Y%m%d')), 'wb') as outfile:
outfile.write(r.content)
print('Downloaded metadata')
errcode = r.status_code
elif r.status_code == 404:
print("No metadata found, error 404")
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
time.sleep(1)
# convert to csv
file = pd.read_excel(download_location.format(today.strftime('%Y%m%d')))
file.to_csv('/esarchive/obs/ghost/MITECO/metadata/network_provided/MITECO_META_{}.csv'.format(today.strftime('%Y%m%d')), index=False, header=True)
"""# create json from original metadata file
json_metadata = {}
with open('/esarchive/obs/ghost/MITECO/metadata/network_provided/MITECO_META.csv', 'r', encoding='ISO-8859-1') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['SiteName_NomDuSite']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/MITECO/metadata/processed/MITECO_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
# create json in desired shape from current metadata file
json_metadata_now = {}
with open(download_location.format(today.strftime('%Y%m%d')), encoding='ISO-8859-1') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['SiteName_NomDuSite']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata_now[key] = row
# read standardised file to compare!
with open('/esarchive/obs/ghost/MITECO/metadata/processed/MITECO_META.json', 'r', encoding='ISO-8859-1') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# save
with open('/esarchive/obs/ghost/MITECO/metadata/processed/MITECO_META.json', 'w', encoding='ISO-8859-1') as f:
f.write(json.dumps(json_metadata, indent=4))"""
@@ -7,20 +7,21 @@ import re
import os
from datetime import date
from datetime import timedelta
import requests
import csv
import json
import time
def download_data(mode, version, n_max_tries, max_time_per_dl):
if mode == 'all':
start_year = 1971
end_year = 2024
elif mode == 'nrt':
start_year = date.today().strftime('%Y')
end_year = (date.today() + timedelta(days=365)).strftime('%Y')
version = mode
else:
@@ -36,7 +37,7 @@ def scraper(mode, version):
read_url = False
while read_url == False:
try:
link_data = re.findall("href=[\"\'](.*?)[\"\']", urlopen(link_url, timeout=max_time_per_dl, cafile=certifi.where()).read().decode('utf-8-sig'))
read_url = True
except HTTPError as error:
print('Data not retrieved because %s\nURL: %s'%(error, link_url))
@@ -57,11 +58,10 @@ def scraper(mode, version):
#handles issue of server hanging for 3 minutes sporadically
#try downloading each link a certain number of times before giving up
for link in link_list:
n_tries = 0
errcode = 999
while (n_tries < n_max_tries) & (errcode != 0):
if n_tries == 0:
print('Checking/Downloading %s'%(link))
else:
@@ -91,4 +91,103 @@ def scraper(mode, version):
cmd = 'rm {}/{}'.format(specific_directory,lnk)
process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf8', shell=True)
status = process.communicate()[0]
errcode = process.returncode
\ No newline at end of file
def download_metadata(n_max_tries, max_time_per_dl):
url_metadata = 'https://www.ncei.noaa.gov/pub/data/noaa/isd-history.csv'
download_location = "/esarchive/obs/ghost/NOAA_ISD/metadata/network_provided/NOAA_ISD_META_{}.csv"
n_tries = 0
errcode = 999
today = date.today()
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url_metadata, timeout=max_time_per_dl)
if r.status_code == 200:
with open(download_location.format(today.strftime('%Y%m%d')), 'wb') as outfile:
outfile.write(r.content)
print('Downloaded metadata')
errcode = r.status_code
elif r.status_code == 404:
print("No metadata found, error 404")
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2 # increase waiting time
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
time.sleep(1)
"""
# create json from original metadata file
json_metadata = {}
with open('/esarchive/obs/ghost/NOAA_ISD/metadata/network_provided/NOAA_ISD_META.csv', 'r', encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['USAF']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/NOAA_ISD/metadata/processed/NOAA_ISD_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
"""
# create json in desired shape from current metadata file
json_metadata_now = {}
with open(download_location.format(today.strftime('%Y%m%d')), encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['USAF']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata_now[key] = row
# read standardised file to compare!
with open('/esarchive/obs/ghost/NOAA_ISD/metadata/processed/NOAA_ISD_META.json', 'r', encoding='utf-8') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if parameter in json_metadata_now[station].keys(): # check if column of csv exists in new file
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('{} not in new metadata file'.format(parameter))
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
for parameter in json_metadata_now[station]: # loop through all the parameters
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# is there a new parameter that wasn't in the old file?
if parameter in json_metadata[station].keys():
pass # parameter (column) is already there
else:
print('{} is new'.format(parameter))
json_metadata[station].update({parameter: json_metadata_now[station][parameter]})
# save
with open('/esarchive/obs/ghost/NOAA_ISD/metadata/processed/NOAA_ISD_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
\ No newline at end of file
@@ -47,15 +47,14 @@ def download_data(mode, version, n_max_tries, max_time_per_dl):
options.add_argument("--no-sandbox")
#options.add_argument("--headless")
n_tries = 0
errcode = 999
while (n_tries < n_max_tries) and (errcode != 200):
try:
svc = webdriver.ChromeService(executable_path=binary_path)
driver = webdriver.Chrome(service=svc, options=options)
# open url
driver.get(baseurl)
WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.ID, 'sites-list'))) # wait till loaded
@@ -105,14 +104,11 @@ def download_data(mode, version, n_max_tries, max_time_per_dl):
print("Number of tries: {}".format(n_tries))
continue
driver.close()
if n_tries == n_max_tries:
print('Failed downloading US_NADP_AMNet data {} times in {} seconds'.format(n_tries, max_time_per_dl))
os.rename("{}AMNET-ALL-h.csv".format(download_location), "/esarchive/obs/ghost/US_NADP_AMNet/original_files/{}/AMNET-ALL-h.csv".format(version))
@@ -229,7 +225,7 @@ def download_metadata(n_max_tries, max_time_per_dl):
for parameter in json_metadata[station]:
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
...
@@ -8,9 +8,9 @@ import pandas
import os.path
import urllib
import time
import zipfile
import json
import csv
from chromedriver_py import binary_path
from selenium.webdriver.chrome.options import Options
@@ -22,22 +22,16 @@ from selenium.webdriver.support import expected_conditions as EC
def download_data(mode, version, n_max_tries, max_time_per_dl):
baseurl = 'https://nadp.slh.wisc.edu/networks/ammonia-monitoring-network/'
if mode == 'all':
os.makedirs('/esarchive/obs/ghost/US_NADP_AMoN/original_files/{}/temp/'.format(version), exist_ok=True)
download_location = '/esarchive/obs/ghost/US_NADP_AMoN/original_files/{}/temp/'.format(version)
elif mode == 'nrt':
download_location = '/esarchive/obs/ghost/US_NADP_AMoN/original_files/nrt/temp/'
else:
print('time mode inapplicable')
@@ -52,38 +46,198 @@ def scraper(mode, version):
svc = webdriver.ChromeService(executable_path=binary_path)
driver = webdriver.Chrome(service=svc, options=options)
n_tries = 0
errcode = 999
while (n_tries < n_max_tries) and (errcode != 200):
try:
# open url
driver.get(baseurl)
WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.ID, 'sites-list'))) # wait till loaded
dropdown_element = driver.find_element(By.ID, 'data-type')
select = Select(dropdown_element)
options = [opt.get_attribute("text") for opt in select.options]
select.select_by_visible_text("Bi-weekly")
time.sleep(max_time_per_dl)
dropdown_element = driver.find_element(By.ID, 'sites-list')
select = Select(dropdown_element)
options = [opt.get_attribute("text") for opt in select.options]
select.select_by_visible_text("All Sites")
time.sleep(max_time_per_dl)
invalid_box = driver.find_element(By.ID, 'invalid')
invalid_box.click()
time.sleep(max_time_per_dl)
# download
driver.find_element(By.ID, 'generate-button-text').click()
# wait until download finished
while not os.path.exists("{}AMoN-ALL-W-i.csv".format(download_location)):
time.sleep(1)
if os.path.isfile("{}AMoN-ALL-W-i.csv".format(download_location)):
print('AMoN-ALL-W-i.csv download successful')
errcode = 200
continue
except TimeoutException as e:
print(e)
max_time_per_dl = max_time_per_dl*2 # set waiting time to double
n_tries = n_tries+1
print("Number of tries: {}".format(n_tries))
continue
except WebDriverException as e:
print(e)
n_tries = n_tries+1
print("Number of tries: {}".format(n_tries))
continue
except:
print("Unknown error")
n_tries = n_tries+1
print("Number of tries: {}".format(n_tries))
continue
driver.close()
if n_tries == n_max_tries:
print('Failed downloading US_NADP_AMoN data {} times in {} seconds'.format(n_tries, max_time_per_dl))
os.rename("{}AMoN-ALL-W-i.csv".format(download_location), "/esarchive/obs/ghost/US_NADP_AMoN/original_files/{}/AMoN-ALL-W-i.csv".format(version))
def download_metadata(n_max_tries, max_time_per_dl):
baseurl = 'https://nadp.slh.wisc.edu/networks/ammonia-monitoring-network/'
#os.makedirs('/esarchive/obs/ghost/US_NADP_AMNet/metadata/network_provided/', exist_ok=True)
download_location = '/esarchive/obs/ghost/US_NADP_AMoN/metadata/network_provided/'
today = date.today()
# set up driver
options = Options()
prefs = {'download.default_directory' : download_location}
options.add_experimental_option('prefs', prefs)
options.add_argument("--no-sandbox")
options.add_argument("--headless")
n_tries = 0
errcode = 999
svc = webdriver.ChromeService(executable_path=binary_path)
driver = webdriver.Chrome(service=svc, options=options)
driver.maximize_window()
while (n_tries < n_max_tries) and (errcode != 200):
try:
# open url
driver.get(baseurl)
#WebDriverWait(driver, max_time_per_dl).until(EC.element_to_be_clickable((By.ID, 'invalid'))) # wait till loaded
time.sleep(max_time_per_dl)
invalid_box = driver.find_element(By.ID, 'download-show-inactive')
driver.execute_script("arguments[0].click()", invalid_box)
# download
#WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.ID, 'generate-button-text'))) # wait till loaded
time.sleep(max_time_per_dl)
bttn = driver.find_element(By.ID, 'network-data-submit')
driver.execute_script("arguments[0].click()", bttn)
# wait until download finished
while not os.path.exists(download_location+'amon.csv'):
time.sleep(1)
if os.path.isfile(download_location+'amon.csv'):
print('Amon metadata download successful')
errcode = 200
continue
except TimeoutException as e:
print(e)
max_time_per_dl = max_time_per_dl*2 # set waiting time to double
n_tries = n_tries+1
continue
except WebDriverException as e:
print(e)
n_tries = n_tries+1
print("Number of tries: {}".format(n_tries))
continue
except:
print("Unknown error")
max_time_per_dl = max_time_per_dl*2 # set waiting time to double
n_tries = n_tries+1
continue
if n_tries == n_max_tries:
print('Failed downloading AMoN metadata {} times in {} seconds'.format(n_tries, max_time_per_dl))
driver.close()
os.rename(download_location+'amon.csv', download_location+'US_NADP_AMoN_META_{}.csv'.format(today.strftime('%Y%m%d')))
# create json from original metadata file =====================================================================================
"""json_metadata = {}
with open('/esarchive/obs/ghost/US_NADP_AMoN/metadata/network_provided/US_NADP_AMoN_META.csv', 'r') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['siteId']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/US_NADP_AMoN/metadata/processed/US_NADP_AMoN_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
"""
# create json in desired shape from current metadata file
json_metadata_now = {}
with open(download_location+'US_NADP_AMoN_META_{}.csv'.format(today.strftime('%Y%m%d')), encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['siteId']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata_now[key] = row
# read standardised file to compare!
with open('/esarchive/obs/ghost/US_NADP_AMoN/metadata/processed/US_NADP_AMoN_META.json', 'r', encoding='utf-8') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# save
with open('/esarchive/obs/ghost/US_NADP_AMoN/metadata/processed/US_NADP_AMoN_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
driver.close()
\ No newline at end of file
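Both AMoN routines above wait for the browser download with an unbounded `while not os.path.exists(...)` loop, which hangs indefinitely if the click never produces a file. A bounded variant of that wait; the helper name and the 300-second default are illustrative assumptions:

import os
import time

def wait_for_download(path, timeout_s=300, poll_s=1):
    """Poll until the expected file appears or timeout_s seconds pass.
    Returns True if the file showed up, False otherwise."""
    waited = 0
    while not os.path.exists(path):
        if waited >= timeout_s:
            return False
        time.sleep(poll_s)
        waited += poll_s
    return True

A False return can then be treated like any other failed attempt inside the existing retry loop instead of blocking the whole run.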
import hashlib # works for all types of data
import time
import requests
def request_download(url, n_max_tries, max_time_per_dl, download_location):
Headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
n_tries = 0
errcode = 999
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url, headers=Headers, timeout=max_time_per_dl) # request inside the loop so every retry actually re-downloads
if r.status_code == 200:
open(download_location, "wb").write(r.content)
print('Downloaded {}'.format(url))
errcode = r.status_code
elif r.status_code == 404:
print("No data found, error 404")
errcode = 200
elif r.status_code == 403:
print("Permission denied for {}".format(url))
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
time.sleep(n_tries ** 2) # wait a little longer each time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url, n_tries, max_time_per_dl, errcode))
time.sleep(1)
# check if files are different
...
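A usage sketch for the request_download helper above, assuming download_location is the full path of the target file and that the caller passes the retry settings explicitly; the URL, year and local directory are illustrative values only:

from datetime import date

# illustrative example: fetch last year's MEXICO_CDMX archive with up to 3 attempts of 120 s each
year = date.today().year - 1
url = 'http://www.aire.cdmx.gob.mx/opendata/anuales_horarios_gz/contaminantes_{}.csv.gz'.format(year)
target = '/tmp/contaminantes_{}.csv.gz'.format(year)  # placeholder directory, not the archive path

request_download(url, n_max_tries=3, max_time_per_dl=120, download_location=target)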
@@ -61,7 +61,13 @@ if __name__ == "__main__":
'CNEMC': {'max_time_dl': 3},
'CANADA_NAPS': {'max_time_dl': 5},
'CAPMoN': {'max_time_dl': 5},
'US_NADP_AMNet': {'max_time_dl': 10},
'US_NADP_AMoN': {'max_time_dl': 7},
'MEXICO_CDMX': {'max_time_dl': 10},
'NOAA_ISD': {'max_time_dl': 15},
'MITECO': {'max_time_dl': 10},
'EANET': {'max_time_dl': 5},
'CHILE_SINCA': {'max_time_dl': 30}}
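The dictionary above carries a per-network max_time_dl (seconds to wait per download attempt). A hedged sketch of how it might be looked up with a fallback for unlisted networks; the variable name networks_config is an assumption, since the excerpt does not show the dictionary's real name:

# illustrative lookup only; names are assumptions, not the runner's actual variables
default_max_time_dl = 10
network_name = 'EANET'
max_time_per_dl = networks_config.get(network_name, {}).get('max_time_dl', default_max_time_dl)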
# download data
@@ -71,7 +77,7 @@ if __name__ == "__main__":
dl_metadata = True
# networks you want to download
networks = [CHILE_SINCA_download]
# download all networks
#networks = ['all']
@@ -107,8 +113,6 @@ if __name__ == "__main__":
if dl_data == True:
network.download_data(mode, version, n_max_tries, max_time_per_dl)
if dl_metadata == True:
network.download_metadata(n_max_tries, max_time_per_dl)