......@@ -171,20 +171,21 @@ def download_metadata(n_max_tries, max_time_per_dl):
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
time.sleep(1)
"""# create json from original metadata file
# create json from original metadata file
json_metadata = {}
with open('/esarchive/obs/ghost/CAPMoN/metadata/network_provided/CAPMoN_META.csv', 'r', encoding='ISO-8859-1') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['SiteName_NomDuSite']
key = row['ID']+'_'+row['Measurements_Mesures'].replace('"', '')[:3]
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/CAPMoN/metadata/processed/CAPMoN_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))"""
f.write(json.dumps(json_metadata, indent=4))
# create json in desired shape from current metadata file
......@@ -193,7 +194,7 @@ def download_metadata(n_max_tries, max_time_per_dl):
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['SiteName_NomDuSite']
key = row['ID']+'_'+row['Measurements_Mesures'].replace('"', '')[:3]
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
......@@ -218,11 +219,18 @@ def download_metadata(n_max_tries, max_time_per_dl):
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
for parameter in json_metadata_now[station]: # loop through all the parameters
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# is there a new parameter that wasn't in the old file?
if parameter in json_metadata[station].keys():
pass # parameter (column) is already there
else:
print('{} is new'.format(parameter))
json_metadata[station].update({parameter: json_metadata_now[station][parameter]})
# save
......
......@@ -15,14 +15,30 @@ import zipfile
import os.path
import os
import pandas as pd
def scraper(mode, version):
def download_data(mode, version, n_max_tries, max_time_per_dl):
url = 'https://monitoring.eanet.asia/document/menu/index#publicData'
download_location = "/esarchive/obs/ghost/EANET/original_files/{}/".format(version)
today = date.today().strftime('%Y%m%d') #+ timedelta(days = 1)).strftime('%Y%m%d') # problem with timezones???
#print(today)
if mode == 'all':
bdate = date(1980, 1, 1) #date(1960, 1, 1) # date before record starts
edate = date.today()
os.makedirs('/esarchive/obs/ghost/EANET/original_files/{}/'.format(version), exist_ok=True)
download_location = "/esarchive/obs/ghost/EANET/original_files/{}/".format(version)
elif mode == 'nrt':
print("EANET no nrt")
exit()
else:
print('time mode inapplicable')
exit()
options = Options()
prefs = {'download.default_directory' : download_location}
......@@ -32,17 +48,17 @@ def scraper(mode, version):
driver = webdriver.Chrome(service=svc, options=options)
driver.get(url)
time.sleep(2)
time.sleep(max_time_per_dl)
# login
email = driver.find_element(By.ID, "email")
email.send_keys("raphael.grodofzig@bsc.es")
passwd = driver.find_element(By.ID, "passwd")
passwd.send_keys("274s9QZ5")
time.sleep(2)
time.sleep(max_time_per_dl)
driver.find_element(By.NAME, "submitBtn").click()
time.sleep(3)
time.sleep(max_time_per_dl)
# find countries
dropdown_element = driver.find_element(By.ID, 'countryCd')
......@@ -104,4 +120,109 @@ def scraper(mode, version):
i=i+1
driver.close()
\ No newline at end of file
driver.close()
def download_metadata(n_max_tries, max_time_per_dl):
url_metadata = 'https://www.eanet.asia/wp-content/uploads/2024/01/Site_Information_Acid_Deposition_Monitoring_NMP2023_1117.xlsm'
download_location = "/esarchive/obs/ghost/EANET/metadata/network_provided/EANET_META_{}.csv"
Headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
n_tries = 0
errcode = 999
today = date.today()
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url_metadata, timeout=max_time_per_dl, headers=Headers)
if r.status_code == 200:
with open(download_location.format(today.strftime('%Y%m%d')), 'wb') as outfile:
outfile.write(r.content)
print('Downloaded metadata')
errcode = r.status_code
elif r.status_code == 404:
print("No metadata found, error 404")
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2 # increase waiting time
time.sleep(n_tries ** 2) # wait a little longer every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
time.sleep(1)
metadata = pd.read_excel(download_location.format(today.strftime('%Y%m%d')), engine='pyxlsb').fillna('')
print(metadata)
"""
# create json from original metadata file
json_metadata = {}
with open('/esarchive/obs/ghost/EANET/metadata/network_provided/EANET_META.csv', 'r', encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['cve_estac']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/EANET/metadata/processed/EANET_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
"""
"""
# create json in desired shape from current metadata file
json_metadata_now = {}
with open(download_location.format(today.strftime('%Y%m%d')), encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['cve_estac']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata_now[key] = row
# read standardised file to compare!
with open('/esarchive/obs/ghost/EANET/metadata/processed/EANET_META.json', 'r', encoding='ISO-8859-1') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if parameter in json_metadata_now[station].keys(): # check if column of csv exists in new file
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append to the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('{} not in new metadata file'.format(parameter))
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
for parameter in json_metadata_now[station]: # loop through all the parameters
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# is there a new parameter that wasn't in the old file?
if parameter in json_metadata[station].keys():
pass # parameter (column) is already there
else:
print('{} is new'.format(parameter))
json_metadata[station].update({parameter: json_metadata_now[station][parameter]})
# save
with open('/esarchive/obs/ghost/EANET/metadata/processed/EANET_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))"""
\ No newline at end of file
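Most of the requests-based downloads in this changeset repeat the same retry pattern: a bounded attempt counter, a doubled timeout, and a quadratically growing sleep. A minimal sketch of how that pattern could be factored into a shared helper; retry_get and its parameters are assumptions for illustration, not part of this patch.
import time
import requests
def retry_get(url, n_max_tries, max_time_per_dl, headers=None):
    # return the successful response, or None once the retry budget is exhausted
    n_tries = 0
    while n_tries < n_max_tries:
        try:
            r = requests.get(url, headers=headers, timeout=max_time_per_dl)
            if r.status_code == 200:
                return r
            if r.status_code in (403, 404):
                return None  # treated as final in the scrapers, no retry
            print('Response error {}, attempt {}'.format(r.status_code, n_tries))
        except requests.RequestException as e:
            print('Request failed: {}'.format(e))
        n_tries += 1
        max_time_per_dl = max_time_per_dl * 2  # give the server more time on the next attempt
        time.sleep(n_tries ** 2)               # wait a little longer every round
    print('Failed downloading {} {} times'.format(url, n_tries))
    return None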
import requests
import time
from datetime import date
from datetime import timedelta
import zipfile
import urllib
import os.path
import os
import pandas as pd
def scraper(mode):
url = 'https://monitoring.eanet.asia/document/menu/index#publicData'
download_url = 'https://www.eanet.asia/wp-content/uploads/2024/01/Site_Information_Acid_Deposition_Monitoring_NMP2023_1117.xlsm'
download_location = "/esarchive/obs/ghost/EANET/metadata/network_provided/"
today = date.today().strftime('%Y%m%d') #+ timedelta(days = 1)).strftime('%Y%m%d') # problem with timezones???
"""
Headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
r = requests.get(download_url, timeout=120, headers=Headers)
print(r.status_code)
if r.status_code == 200:
urllib.request.urlretrieve(url, download_location+"downloaded_metadata.xlsm")
print('Downloaded metadata')
else:
print('url status not ok')"""
# open file
metadata = pd.read_excel(download_location+"downloaded_metadata.xlsm", engine='pyxlsb').fillna('')
print(metadata)
......@@ -8,20 +8,20 @@ import urllib
import tarfile
import shutil
import gzip
import csv
import json
def scraper(mode, version):
def download_data(mode, version, n_max_tries, max_time_per_dl):
base_url = 'http://www.aire.cdmx.gob.mx/opendata/anuales_horarios_gz/contaminantes_{}.csv.gz'
if mode == 'all':
bdate = date(1980, 1, 1) #date(1960, 1, 1) # date before record starts
edate = date.today()
os.makedirs('/esarchive/obs/ghost/MEXICO_CDMX/original_files/{}/'.format(version), exist_ok=True)
download_location = '/esarchive/obs/ghost/MEXICO_CDMX/original_files/'+version+'/contaminantes_{}.csv.gz'
elif mode == 'nrt':
......@@ -39,21 +39,143 @@ def scraper(mode, version):
# download
for year in years:
url = base_url.format(year)
r = requests.get(url, timeout=120)
if r.status_code == 200:
urllib.request.urlretrieve(url, download_location.format(year))
print('Downloaded {}'.format(url))
n_tries = 0
errcode = 999
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url, timeout=max_time_per_dl)
if r.status_code == 200:
urllib.request.urlretrieve(url, download_location.format(year))
print('Downloaded {}'.format(url))
# unzip
with gzip.open(download_location.format(year), 'rb') as f_in:
with open(download_location.format(year)[:-3], 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
# remove files
os.remove(download_location.format(year))
errcode = r.status_code
elif r.status_code == 404:
print("No data found, error 404, year {}".format(year))
errcode = 200
elif r.status_code == 403:
print("Permission denied for {}".format(year))
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2 # increase waiting time
time.sleep(n_tries ** 2) # wait a little longer every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url, n_tries, max_time_per_dl, errcode))
time.sleep(1)
# unzip
with gzip.open(download_location.format(year), 'rb') as f_in:
with open(download_location.format(year)[:-3], 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
def download_metadata(n_max_tries, max_time_per_dl):
# remove files
os.remove(download_location.format(year))
url_metadata = 'http://www.aire.cdmx.gob.mx/opendata/catalogos/cat_estacion.csv'
download_location = "/esarchive/obs/ghost/MEXICO_CDMX/metadata/network_provided/MEXICO_CDMX_META_{}.csv"
n_tries = 0
errcode = 999
today = date.today()
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url_metadata, timeout=max_time_per_dl)
if r.status_code == 200:
with open(download_location.format('_unformatted'), 'wb') as outfile:
outfile.write(r.content)
print('Downloaded metadata')
errcode = r.status_code
elif r.status_code == 404:
print("No metadata found, error 404")
errcode = 200
else:
print('No {}'.format(url))
time.sleep(1)
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2 # increase waiting time
time.sleep(n_tries ** 2) # wait a little longer every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
time.sleep(1)
# read it with pandas to clean the header
meta_file = pd.read_csv(download_location.format('_unformatted'), header=[1], encoding='ISO-8859-1')
meta_file.to_csv(download_location.format(today.strftime('%Y%m%d')), index=False)
os.remove(download_location.format('_unformatted'))
# create json from original metadata file
"""json_metadata = {}
with open('/esarchive/obs/ghost/MEXICO_CDMX/metadata/network_provided/MEXICO_CDMX_META.csv', 'r', encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['cve_estac']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/MEXICO_CDMX/metadata/processed/MEXICO_CDMX_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
"""
# create json in desired shape from current metadata file
json_metadata_now = {}
with open(download_location.format(today.strftime('%Y%m%d')), encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['cve_estac']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata_now[key] = row
# read standardised file to compare!
with open('/esarchive/obs/ghost/MEXICO_CDMX/metadata/processed/MEXICO_CDMX_META.json', 'r', encoding='ISO-8859-1') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if parameter in json_metadata_now[station].keys(): # check if column of csv exists in new file
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append to the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('{} not in new metadata file'.format(parameter))
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
for parameter in json_metadata_now[station]: # loop through all the parameters
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# is there a new parameter that wasn't in the old file?
if parameter in json_metadata[station].keys():
pass # parameter (column) is already there
else:
print('{} is new'.format(parameter))
json_metadata[station].update({parameter: json_metadata_now[station][parameter]})
# save
with open('/esarchive/obs/ghost/MEXICO_CDMX/metadata/processed/MEXICO_CDMX_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
\ No newline at end of file
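The processed *_META.json files handled by these download_metadata functions all share one shape: a station key maps to the CSV columns, and each column keeps parallel lists of historical values and the dates they changed. A minimal sketch of that structure and of the append step, with illustrative station and column names; append_if_changed is a hypothetical helper, not part of this patch.
# illustrative shape of the processed metadata JSON
json_metadata = {
    'STATION_1': {                            # station key, e.g. row['cve_estac']
        'name': {'values': ['Old name'], 'update_time': ['2024-03-01']},
        'longitude': {'values': ['-99.20'], 'update_time': ['2024-03-01']},
    }
}
def append_if_changed(old_entry, new_entry):
    # old_entry holds the full history; new_entry holds one freshly scraped value
    if old_entry['values'][-1] != new_entry['values'][0]:
        old_entry['values'].append(new_entry['values'][0])
        old_entry['update_time'].append(new_entry['update_time'][0])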
......@@ -22,22 +22,18 @@ from selenium.webdriver.support import expected_conditions as EC
def scraper(mode, version):
def download_data(mode, version, n_max_tries, max_time_per_dl):
baseurl = 'https://www.miteco.gob.es/es/calidad-y-evaluacion-ambiental/temas/atmosfera-y-calidad-del-aire/calidad-del-aire/evaluacion-datos/datos/datos-2001-2021.html'
if mode == 'all':
bdate = date(2001, 1, 1) #date(1960, 1, 1) # date before record starts
edate = date.today()
os.makedirs('/esarchive/obs/ghost/MITECO/original_files/{}/'.format(version), exist_ok=True)
download_location = '/esarchive/obs/ghost/MITECO/original_files/{}/'.format(version)
elif mode == 'nrt':
bdate = date(2024, 3, 2) #date.today() - timedelta(days = 1) # if code is run after 2 am, data from previous day will be available
edate = date(2024, 3, 3) #date.today() - timedelta(days = 1)
print("nrt not available")
download_location = '/esarchive/obs/ghost/MITECO/original_files/nrt/'
else:
......@@ -70,23 +66,40 @@ def scraper(mode, version):
for zip_link in zip_links:
filename = zip_link.get("href").rpartition('/')[-1]
url = 'https://www.miteco.gob.es/{}'.format(zip_link.get("href"))
n_tries = 0
errcode = 999
r = requests.get(url, timeout=120)
if r.status_code == 200:
urllib.request.urlretrieve(url, download_location+filename)
print('Downloaded {}'.format(filename))
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url, timeout=max_time_per_dl)
if r.status_code == 200:
urllib.request.urlretrieve(url, download_location+filename)
print('Downloaded {}'.format(filename))
# unzip
with zipfile.ZipFile(download_location+filename, 'r') as zip_ref:
zip_ref.extractall(download_location)
os.remove(download_location+filename)
# unzip
with zipfile.ZipFile(download_location+filename, 'r') as zip_ref:
zip_ref.extractall(download_location)
os.remove(download_location+filename)
else:
print('No {}'.format(url))
errcode = r.status_code
elif r.status_code == 404:
print("No data found, error 404")
errcode = 200
elif r.status_code == 403:
print("Permission denied for {}".format(url))
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2
time.sleep(n_tries ** 2) # wait a little longer every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url, n_tries, max_time_per_dl, errcode))
time.sleep(1)
# go to hyperlinks
......@@ -118,21 +131,41 @@ def scraper(mode, version):
os.remove(zip_file)
continue
n_tries = 0
errcode = 999
r = requests.get(url, timeout=120)
if r.status_code == 200:
urllib.request.urlretrieve(url, download_location+filename)
print('Downloaded {}'.format(filename))
# unzip
with zipfile.ZipFile(download_location+filename, 'r') as zip_ref:
zip_ref.extractall(download_location)
os.remove(download_location+filename)
else:
print('No {}'.format(url))
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url, timeout=max_time_per_dl)
if r.status_code == 200:
urllib.request.urlretrieve(url, download_location+filename)
print('Downloaded {}'.format(filename))
# unzip
with zipfile.ZipFile(download_location+filename, 'r') as zip_ref:
zip_ref.extractall(download_location)
os.remove(download_location+filename)
errcode = r.status_code
elif r.status_code == 404:
print("No data found, error 404")
errcode = 200
elif r.status_code == 403:
print("Permission denied for {}".format(url))
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2
time.sleep(n_tries ** 2) # wait a little longer every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url, n_tries, max_time_per_dl, errcode))
time.sleep(1)
# delete metadata
......@@ -152,3 +185,96 @@ def scraper(mode, version):
driver.close()
def download_metadata(n_max_tries, max_time_per_dl):
url_metadata = 'https://www.miteco.gob.es/content/dam/miteco/es/calidad-y-evaluacion-ambiental/sgalsi/atm%C3%B3sfera-y-calidad-del-aire/evaluaci%C3%B3n-2022/Metainformacion2022.xlsx'
download_location = "/esarchive/obs/ghost/MITECO/metadata/network_provided/MITECO_META_{}.xlsx"
Headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
n_tries = 0
errcode = 999
today = date.today()
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url_metadata, headers=Headers, timeout=max_time_per_dl)
if r.status_code == 200:
with open(download_location.format(today.strftime('%Y%m%d')), 'wb') as outfile:
outfile.write(r.content)
print('Downloaded metadata')
errcode = r.status_code
elif r.status_code == 404:
print("No metadata found, error 404")
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
time.sleep(n_tries ** 2) # wait a little longer every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
time.sleep(1)
# convert to csv
file = pd.read_excel(download_location.format(today.strftime('%Y%m%d')))
file.to_csv('/esarchive/obs/ghost/MITECO/metadata/network_provided/MITECO_META_{}.csv'.format(today.strftime('%Y%m%d')), index=False, header=True)
"""# create json from original metadata file
json_metadata = {}
with open('/esarchive/obs/ghost/MITECO/metadata/network_provided/MITECO_META.csv', 'r', encoding='ISO-8859-1') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['SiteName_NomDuSite']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/MITECO/metadata/processed/MITECO_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
# create json in desired shape from current metadata file
json_metadata_now = {}
with open(download_location.format(today.strftime('%Y%m%d')), encoding='ISO-8859-1') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['SiteName_NomDuSite']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata_now[key] = row
# read standardised file to compare!
with open('/esarchive/obs/ghost/MITECO/metadata/processed/MITECO_META.json', 'r', encoding='ISO-8859-1') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append to the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# save
with open('/esarchive/obs/ghost/MITECO/metadata/processed/MITECO_META.json', 'w', encoding='ISO-8859-1') as f:
f.write(json.dumps(json_metadata, indent=4))"""
......@@ -7,20 +7,21 @@ import re
import os
from datetime import date
from datetime import timedelta
import requests
import csv
import json
import time
def scraper(mode, version):
def download_data(mode, version, n_max_tries, max_time_per_dl):
if mode == 'all':
start_year = 1971
end_year = 2024
elif mode == 'nrt':
start_year = date.today().strftime('%Y')
end_year = (date.today() + timedelta(days=365)).strftime('%Y')
version = mode
else:
......@@ -36,7 +37,7 @@ def scraper(mode, version):
read_url = False
while read_url == False:
try:
link_data = re.findall("href=[\"\'](.*?)[\"\']", urlopen(link_url, timeout=15, cafile=certifi.where()).read().decode('utf-8-sig'))
link_data = re.findall("href=[\"\'](.*?)[\"\']", urlopen(link_url, timeout=max_time_per_dl, cafile=certifi.where()).read().decode('utf-8-sig'))
read_url = True
except HTTPError as error:
print('Data not retrieved because %s\nURL: %s'%(error, link_url))
......@@ -57,11 +58,10 @@ def scraper(mode, version):
# handles issue of server hanging for 3 minutes sporadically
#try downloading each link a certain number of times before giving up
n_tries_limit = 3
for link in link_list:
n_tries = 0
errcode = 999
while (n_tries < n_tries_limit) & (errcode != 0):
while (n_tries < n_max_tries) & (errcode != 0):
if n_tries == 0:
print('Checking/Downloading %s'%(link))
else:
......@@ -91,4 +91,103 @@ def scraper(mode, version):
cmd = 'rm {}/{}'.format(specific_directory,lnk)
process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf8', shell=True)
status = process.communicate()[0]
errcode = process.returncode
\ No newline at end of file
errcode = process.returncode
def download_metadata(n_max_tries, max_time_per_dl):
url_metadata = 'https://www.ncei.noaa.gov/pub/data/noaa/isd-history.csv'
download_location = "/esarchive/obs/ghost/NOAA_ISD/metadata/network_provided/NOAA_ISD_META_{}.csv"
n_tries = 0
errcode = 999
today = date.today()
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url_metadata, timeout=max_time_per_dl)
if r.status_code == 200:
with open(download_location.format(today.strftime('%Y%m%d')), 'wb') as outfile:
outfile.write(r.content)
print('Downloaded metadata')
errcode = r.status_code
elif r.status_code == 404:
print("No metadata found, error 404")
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2 # increase waiting time
time.sleep(n_tries ** 2) # wait a little longer every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
time.sleep(1)
"""
# create json from original metadata file
json_metadata = {}
with open('/esarchive/obs/ghost/NOAA_ISD/metadata/network_provided/NOAA_ISD_META.csv', 'r', encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['USAF']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/NOAA_ISD/metadata/processed/NOAA_ISD_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
"""
# create json in desired shape from current metadata file
json_metadata_now = {}
with open(download_location.format(today.strftime('%Y%m%d')), encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['USAF']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata_now[key] = row
# read standardised file to compare!
with open('/esarchive/obs/ghost/NOAA_ISD/metadata/processed/NOAA_ISD_META.json', 'r', encoding='utf-8') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if parameter in json_metadata_now[station].keys(): # check if column of csv exists in new file
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append to the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('{} not in new metadata file'.format(parameter))
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
for parameter in json_metadata_now[station]: # loop through all the parameters
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# is there a new parameter that wasn't in the old file?
if parameter in json_metadata[station].keys():
pass # parameter (column) is already there
else:
print('{} is new'.format(parameter))
json_metadata[station].update({parameter: json_metadata_now[station][parameter]})
# save
with open('/esarchive/obs/ghost/NOAA_ISD/metadata/processed/NOAA_ISD_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
\ No newline at end of file
......@@ -47,15 +47,14 @@ def download_data(mode, version, n_max_tries, max_time_per_dl):
options.add_argument("--no-sandbox")
#options.add_argument("--headless")
svc = webdriver.ChromeService(executable_path=binary_path)
driver = webdriver.Chrome(service=svc, options=options)
n_tries = 0
errcode = 999
while (n_tries < n_max_tries) and (errcode != 200):
try:
svc = webdriver.ChromeService(executable_path=binary_path)
driver = webdriver.Chrome(service=svc, options=options)
# open url
driver.get(baseurl)
WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.ID, 'sites-list'))) # wait till loaded
......@@ -105,14 +104,11 @@ def download_data(mode, version, n_max_tries, max_time_per_dl):
print("Number of tries: {}".format(n_tries))
continue
driver.close()
driver.close()
if n_tries == n_max_tries:
print('Failed downloading US_NADP_AMNet data {} times in {} seconds'.format(n_tries, max_time_per_dl))
print(os.path.split(download_location[:-5]))
os.rename("{}AMNET-ALL-h.csv".format(download_location), "/esarchive/obs/ghost/US_NADP_AMNet/original_files/{}/AMNET-ALL-h.csv".format(version))
......@@ -229,7 +225,7 @@ def download_metadata(n_max_tries, max_time_per_dl):
for parameter in json_metadata[station]:
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append to the standardised metadata file
print("new {} --- old {}".format(json_metadata_now[station][parameter]['values'][0], json_metadata[station][parameter]['values'][-1]))
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
......
......@@ -8,9 +8,9 @@ import pandas
import os.path
import urllib
import time
import ssl
import zipfile
from compare_two_files import compare_files
import json
import csv
from chromedriver_py import binary_path
from selenium.webdriver.chrome.options import Options
......@@ -22,22 +22,16 @@ from selenium.webdriver.support import expected_conditions as EC
def scraper(mode, version):
def download_data(mode, version, n_max_tries, max_time_per_dl):
baseurl = 'https://nadp.slh.wisc.edu/networks/ammonia-monitoring-network/'
if mode == 'all':
bdate = date(2013, 12, 1) #date(1960, 1, 1) # date before record starts
edate = date(2024, 1, 1) #date.today() - timedelta(days = 1)
os.makedirs('/esarchive/obs/ghost/US_NADP_AMoN/original_files/{}/'.format(version), exist_ok=True)
download_location = '/esarchive/obs/ghost/US_NADP_AMoN/original_files/{}/'.format(version)
os.makedirs('/esarchive/obs/ghost/US_NADP_AMoN/original_files/{}/temp/'.format(version), exist_ok=True)
download_location = '/esarchive/obs/ghost/US_NADP_AMoN/original_files/{}/temp/'.format(version)
elif mode == 'nrt':
bdate = date(2024, 1, 1) #date.today() - timedelta(days = 1)
edate = date.today() - timedelta(days = 1)
download_location = '/esarchive/obs/ghost/US_NADP_AMoN/original_files/nrt/'
download_location = '/esarchive/obs/ghost/US_NADP_AMoN/original_files/nrt/temp/'
else:
print('time mode inapplicable')
......@@ -52,38 +46,198 @@ def scraper(mode, version):
svc = webdriver.ChromeService(executable_path=binary_path)
driver = webdriver.Chrome(service=svc, options=options)
n_tries = 0
errcode = 999
while (n_tries < n_max_tries) and (errcode != 200):
try:
# open url
driver.get(baseurl)
WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.ID, 'sites-list'))) # wait till loaded
dropdown_element = driver.find_element(By.ID, 'data-type')
select = Select(dropdown_element)
options = [opt.get_attribute("text") for opt in select.options]
select.select_by_visible_text("Bi-weekly")
time.sleep(max_time_per_dl)
dropdown_element = driver.find_element(By.ID, 'sites-list')
select = Select(dropdown_element)
options = [opt.get_attribute("text") for opt in select.options]
select.select_by_visible_text("All Sites")
time.sleep(max_time_per_dl)
invalid_box = driver.find_element(By.ID, 'invalid')
invalid_box.click()
time.sleep(max_time_per_dl)
# download
driver.find_element(By.ID, 'generate-button-text').click()
# wait until download finished
while not os.path.exists("{}AMoN-ALL-W-i.csv".format(download_location)):
time.sleep(1)
if os.path.isfile("{}AMoN-ALL-W-i.csv".format(download_location)):
print('AMoN-ALL-W-i.csv download successful')
errcode = 200
continue
except TimeoutException as e:
print(e)
max_time_per_dl = max_time_per_dl*2 # double the waiting time
n_tries = n_tries+1
print("Number of tries: {}".format(n_tries))
continue
except WebDriverException as e:
print(e)
n_tries = n_tries+1
print("Number of tries: {}".format(n_tries))
continue
except:
print("Unknown error")
n_tries = n_tries+1
print("Number of tries: {}".format(n_tries))
continue
driver.close()
if n_tries == n_max_tries:
print('Failed downloading US_NADP_AMoN data {} times in {} seconds'.format(n_tries, max_time_per_dl))
os.rename("{}AMoN-ALL-W-i.csv".format(download_location), "/esarchive/obs/ghost/US_NADP_AMoN/original_files/{}/AMoN-ALL-W-i.csv".format(version))
def download_metadata(n_max_tries, max_time_per_dl):
# open url
driver.get(baseurl)
WebDriverWait(driver, 60).until(EC.presence_of_all_elements_located((By.ID, 'sites-list'))) # wait till loaded
dropdown_element = driver.find_element(By.ID, 'data-type')
select = Select(dropdown_element)
options = [opt.get_attribute("text") for opt in select.options]
print(options)
select.select_by_visible_text("Bi-weekly")
time.sleep(3)
dropdown_element = driver.find_element(By.ID, 'sites-list')
select = Select(dropdown_element)
options = [opt.get_attribute("text") for opt in select.options]
print(options)
select.select_by_visible_text("All Sites")
time.sleep(3)
invalid_box = driver.find_element(By.ID, 'invalid')
invalid_box.click()
time.sleep(3)
# download
driver.find_element(By.ID, 'generate-button-text').click()
baseurl = 'https://nadp.slh.wisc.edu/networks/ammonia-monitoring-network/'
#os.makedirs('/esarchive/obs/ghost/US_NADP_AMNet/metadata/network_provided/', exist_ok=True)
download_location = '/esarchive/obs/ghost/US_NADP_AMoN/metadata/network_provided/'
today = date.today()
# wait until download finished
while not os.path.exists("{}AMoN-ALL-W-i.csv".format(download_location)):
time.sleep(1)
# set up driver
options = Options()
prefs = {'download.default_directory' : download_location}
options.add_experimental_option('prefs', prefs)
options.add_argument("--no-sandbox")
options.add_argument("--headless")
if os.path.isfile("{}AMoN-ALL-W-i.csv".format(download_location)):
print('AMoN-ALL-W-i.csv download successful')
n_tries = 0
errcode = 999
svc = webdriver.ChromeService(executable_path=binary_path)
driver = webdriver.Chrome(service=svc, options=options)
driver.maximize_window()
while (n_tries < n_max_tries) and (errcode != 200):
try:
# open url
driver.get(baseurl)
#WebDriverWait(driver, max_time_per_dl).until(EC.element_to_be_clickable((By.ID, 'invalid'))) # wait till loaded
time.sleep(max_time_per_dl)
invalid_box = driver.find_element(By.ID, 'download-show-inactive')
driver.execute_script("arguments[0].click()", invalid_box)
# download
#WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.ID, 'generate-button-text'))) # wait till loaded
time.sleep(max_time_per_dl)
bttn = driver.find_element(By.ID, 'network-data-submit')
driver.execute_script("arguments[0].click()", bttn)
# wait until download finished
while not os.path.exists(download_location+'amon.csv'):
time.sleep(1)
if os.path.isfile(download_location+'amon.csv'):
print('Amon metadata download successful')
errcode = 200
continue
except TimeoutException as e:
print(e)
max_time_per_dl = max_time_per_dl*2 # double the waiting time
n_tries = n_tries+1
continue
except WebDriverException as e:
print(e)
n_tries = n_tries+1
print("Number of tries: {}".format(n_tries))
continue
except:
print("Unknown error")
max_time_per_dl = max_time_per_dl*2 # double the waiting time
n_tries = n_tries+1
continue
if n_tries == n_max_tries:
print('Failed downloading AMoN metadata {} times in {} seconds'.format(n_tries, max_time_per_dl))
driver.close()
os.rename(download_location+'amon.csv', download_location+'US_NADP_AMoN_META_{}.csv'.format(today.strftime('%Y%m%d')))
# create json from original metadata file =====================================================================================
"""json_metadata = {}
with open('/esarchive/obs/ghost/US_NADP_AMoN/metadata/network_provided/US_NADP_AMoN_META.csv', 'r') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['siteId']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/US_NADP_AMoN/metadata/processed/US_NADP_AMoN_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
"""
# create json in desired shape from current metadata file
json_metadata_now = {}
with open(download_location+'US_NADP_AMoN_META_{}.csv'.format(today.strftime('%Y%m%d')), encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['siteId']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata_now[key] = row
# read standardised file to compare!
with open('/esarchive/obs/ghost/US_NADP_AMoN/metadata/processed/US_NADP_AMoN_META.json', 'r', encoding='utf-8') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append to the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# save
with open('/esarchive/obs/ghost/US_NADP_AMoN/metadata/processed/US_NADP_AMoN_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
driver.close()
\ No newline at end of file
import hashlib # works for all types of data
import requests
import time
def request_download(url, n_max_tries, max_time_per_dl, download_location, file, year):
# n_max_tries, file and year are used below, so they are passed in explicitly
Headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
n_tries = 0
errcode = 999
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url, headers=Headers, timeout=max_time_per_dl)
if r.status_code == 200:
open(download_location + file.format(year), "wb").write(r.content)
print('Downloaded {}'.format(file.format(year)))
errcode = r.status_code
elif r.status_code == 404:
print("No ozone l data found, error 404")
errcode = 200
elif r.status_code == 403:
print("Permission denied for {}".format(file.format(year)))
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
time.sleep(n_tries ** 2) # wait a little longer every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url, n_tries, max_time_per_dl, errcode))
time.sleep(1)
# check if files are different
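A minimal sketch of the comparison the comment above points at, assuming the intent is to hash the freshly downloaded file against the previously stored copy (the hashlib import at the top of this file suggests as much); files_are_identical and both path arguments are hypothetical names, not part of this patch.
import hashlib
def files_are_identical(path_a, path_b, chunk_size=65536):
    # hash both files in chunks so large downloads never need to fit in memory
    digests = []
    for path in (path_a, path_b):
        h = hashlib.sha256()
        with open(path, 'rb') as f:
            for chunk in iter(lambda: f.read(chunk_size), b''):
                h.update(chunk)
        digests.append(h.hexdigest())
    return digests[0] == digests[1]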
......
......@@ -61,7 +61,13 @@ if __name__ == "__main__":
'CNEMC': {'max_time_dl': 3},
'CANADA_NAPS': {'max_time_dl': 5},
'CAPMoN': {'max_time_dl': 5},
'US_NADP_AMNet': {'max_time_dl': 10}}
'US_NADP_AMNet': {'max_time_dl': 10},
'US_NADP_AMoN': {'max_time_dl': 7},
'MEXICO_CDMX': {'max_time_dl': 10},
'NOAA_ISD': {'max_time_dl': 15},
'MITECO': {'max_time_dl': 10},
'EANET': {'max_time_dl': 5},
'CHILE_SINCA': {'max_time_dl': 30}}
# download data
......@@ -71,7 +77,7 @@ if __name__ == "__main__":
dl_metadata = True
# networks you want to download
networks = [US_NADP_AMNet_download]
networks = [CHILE_SINCA_download]
# download all networks
#networks = ['all']
......@@ -107,8 +113,6 @@ if __name__ == "__main__":
if dl_data == True:
network.download_data(mode, version, n_max_tries, max_time_per_dl)
pass
if dl_metadata == True:
network.download_metadata(n_max_tries, max_time_per_dl)
pass