Commits (2)
......@@ -171,20 +171,21 @@ def download_metadata(n_max_tries, max_time_per_dl):
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
time.sleep(1)
"""# create json from original metadata file
# create json from original metadata file
json_metadata = {}
with open('/esarchive/obs/ghost/CAPMoN/metadata/network_provided/CAPMoN_META.csv', 'r', encoding='ISO-8859-1') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['SiteName_NomDuSite']
key = row['ID']+'_'+row['Measurements_Mesures'].replace('"', '')[:3]
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/CAPMoN/metadata/processed/CAPMoN_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))"""
f.write(json.dumps(json_metadata, indent=4))
# create json in desired shape from current metadata file
......@@ -193,7 +194,7 @@ def download_metadata(n_max_tries, max_time_per_dl):
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['SiteName_NomDuSite']
key = row['ID']+'_'+row['Measurements_Mesures'].replace('"', '')[:3]
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
......@@ -218,11 +219,18 @@ def download_metadata(n_max_tries, max_time_per_dl):
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
for parameter in json_metadata_now[station]: # loop through all the parameters
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# is there a new parameter that wasn't in the old file?
if parameter in json_metadata[station].keys():
pass # parameter (column) is already there
else:
print('{} is new'.format(parameter))
json_metadata[station].update({parameter: json_metadata_now[station][parameter]})
# save
......
......@@ -5,10 +5,11 @@ from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup
import requests
import time
......@@ -23,14 +24,16 @@ import zipfile
import shutil
import os
import re
import csv
import json
def scraper(mode, version):
def download_data(mode, version, n_max_tries, max_time_per_dl):
# paths and variables
variables_website = ['MP 10','MP 2,5', 'MP 10discreto', 'SO2', 'NO2', 'NOX', 'NO', 'CO', 'O3', 'Pb', 'As', 'Cu'] # complete list later
variables_website = ['MP 10','MP 2,5', 'MP 10discreto', 'SO2', 'NO2', 'NOX', 'NO', 'CO', 'O3', 'Pb', 'As', 'Cu']
variables_text = ["Material particulado MP 10", "Material particulado MP 2,5", "Material particulado 10 micrometros discreto", 'Dióxido de azufre', 'Dióxido de nitrógeno', 'Óxidos de nitrógeno', 'Monóxido de nitrógeno', 'Monóxido de carbono', 'Ozono', 'Plomo', 'Arsénico', 'Cobre']
time_resolutions_website = ["registro diario", "registro horario"] # complete later
time_resolutions_website = ["registro diario", "registro horario"]
variables_ghost = ['PM10', 'PM2.5', 'PM10disc', 'SO2', 'NO2', 'NOX', 'NO','CO', 'O3','Pb', 'As', 'Cu'] # needs to be same order as variables_website!
time_resolution_ghost = ['daily', 'hourly']
......@@ -39,15 +42,28 @@ def scraper(mode, version):
baseurl = 'https://sinca.mma.gob.cl/index.php/'
# only for nrt
bdate = "240101"
if mode == 'all':
# create download directory
os.makedirs('/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version), exist_ok=True)
download_location = '/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version)
elif mode == 'nrt':
bdate = date(date.today().year, 1, 1).strftime('%Y%m%d')[2:] #"240101"
edate = date.today().strftime('%Y%m%d')[2:]
print(edate)
# create download directory
version = mode
os.makedirs('/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version), exist_ok=True)
download_location = '/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version)
else:
print('time mode inapplicable')
n_tries = 0
errcode = 999
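# retry the whole Selenium session: errcode only becomes 200 once a run finishes without raising WebDriverException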
while (n_tries < n_max_tries) and (errcode != 200):
try:
# set up driver
options = Options()
prefs = {'download.default_directory' : download_location}
......@@ -57,12 +73,14 @@ def scraper(mode, version):
options.add_argument("--disable-extensions")
options.add_argument("--disable-gpu")
options.add_argument("--disable-dev-shm-usage")
if n_tries > 0:
options.add_argument("--headless")
svc = webdriver.ChromeService(executable_path=binary_path)
driver = webdriver.Chrome(service=svc, options=options)
# open url
driver.get(baseurl)
WebDriverWait(driver, 60).until(EC.presence_of_all_elements_located((By.XPATH, '//a[contains(@href, "/index.php/region/index/id/")]'))) # wait till loaded
WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.XPATH, '//a[contains(@href, "/index.php/region/index/id/")]'))) # wait till loaded
# navigate to regions
html = driver.page_source
......@@ -145,7 +163,7 @@ def scraper(mode, version):
driver.switch_to.frame("right")
time.sleep(10)
WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.LINK_TEXT, "Excel CSV"))) # wait till loaded
WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.LINK_TEXT, "Excel CSV"))) # wait till loaded
driver.find_element(By.LINK_TEXT, "Excel CSV").click()
......@@ -162,43 +180,71 @@ def scraper(mode, version):
driver.switch_to.default_content()
driver.switch_to.frame("left")
i=i+1
driver.close()
errcode = 200
except WebDriverException as e:
print(e)
n_tries = n_tries+1
print("Number of tries: {}".format(n_tries))
continue
if n_tries == n_max_tries:
print('Failed downloading CHILE_SINCA data {} times in {} seconds'.format(n_tries, max_time_per_dl))
driver.close()
def scraper_metadata(mode, version):
def download_metadata(n_max_tries, max_time_per_dl):
# paths and variables
variables_website = ['MP 10','MP 2,5', 'MP 10discreto', 'SO2', 'NO2', 'NOX', 'NO', 'CO', 'O3', 'Pb', 'As', 'Cu']
variables_text = ["Material particulado MP 10", "Material particulado MP 2,5", "Material particulado 10 micrometros discreto", 'Dióxido de azufre', 'Dióxido de nitrógeno', 'Óxidos de nitrógeno', 'Monóxido de nitrógeno', 'Monóxido de carbono', 'Ozono', 'Plomo', 'Arsénico', 'Cobre']
time_resolutions_website = ["registro diario", "registro horario"]
variables_ghost = ['PM10', 'PM2.5', 'PM10disc', 'SO2', 'NO2', 'NOX', 'NO','CO', 'O3','Pb', 'As', 'Cu'] # needs to be same order as variables_website!
time_resolution_ghost = ['daily', 'hourly']
baseurl = 'https://sinca.mma.gob.cl/index.php/'
today = date.today()
metadata15 = pd.read_csv("/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/temp/CHILE_SINCA_META_temp.txt")
print(metadata15)
metadata15 = pd.read_csv("/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/temp/CHILE_SINCA_META_temp.txt")
#print(metadata15)
instruments_old = ['O3_instrument','NO_instrument','NO2_instrument','CO_instrument','CH4_instrument','SO2_instrument','NMHC_instrument','HC_instrument','PM10_instrument','PM2.5_instrument','As_instrument','Cu_instrument','Pb_instrument']
"""
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/processed/CHILE_SINCA_META.json', 'r', encoding='utf-8') as f:
metadata_old = json.loads(f.read())
n_tries = 0
errcode = 999
metadata_new = {}
while (n_tries < n_max_tries) and (errcode != 200):
try:
# set up driver
options = Options()
#prefs = {'download.default_directory' : download_location}
#options.add_experimental_option('prefs', prefs)
options.add_argument("--no-sandbox")
options.add_argument("--headless")
svc = webdriver.ChromeService(executable_path=binary_path)
driver = webdriver.Chrome(service=svc, options=options)
# open url
driver.get(baseurl)
WebDriverWait(driver, 60).until(EC.presence_of_all_elements_located((By.XPATH, '//a[contains(@href, "/index.php/region/index/id/")]'))) # wait till loaded
WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.XPATH, '//a[contains(@href, "/index.php/region/index/id/")]'))) # wait till loaded
# navigate to regions
html = driver.page_source
soup = BeautifulSoup(html, features="html.parser")
regions = soup.find_all("a", href=re.compile(r"^/index.php/region/index/id/"))
regions = soup.find_all("a", {"href": "/index.php/region/index/id/II"}) # Antofagasta
#regions = soup.find_all("a", {"href": "/index.php/region/index/id/II"}) # Antofagasta
for region in regions:
print("Region is "+region.getText())
driver.get("https://sinca.mma.gob.cl/"+region.get("href"))
time.sleep(3)
time.sleep(1)
html = driver.page_source
soup = BeautifulSoup(html, features="html.parser")
......@@ -207,8 +253,8 @@ def scraper_metadata(mode, version):
for station in stations:
station_name = station.getText()
print(station_name)
station_name_new = station.getText()
print(station_name_new)
driver.get("https://sinca.mma.gob.cl/"+station.get("href"))
time.sleep(3)
......@@ -220,29 +266,121 @@ def scraper_metadata(mode, version):
region = soup.find("th", text="Región").find_next_sibling().getText()
province = soup.find("th", text="Provincia").find_next_sibling().getText()
commune = soup.find("th", text="Comuna").find_next_sibling().getText()
UTM_coordinates = soup.find("th", text="Coordenadas UTM").find_next_sibling().getText()
timezone = soup.find("th", text="Huso horario").find_next_sibling().getText()
scraped_metadata = [station_reference, station_name, region, province, commune, UTM_coordinates, timezone]
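# the scraped UTM text is assumed to look like "<easting>E <northing>N..."; stripping spaces and splitting at 'E' separates the two coordinates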
UTM_coordinates = soup.find("th", text="Coordenadas UTM").find_next_sibling().getText().replace(' ', '')
lon = UTM_coordinates.split('E')[0]+'E'
lat = UTM_coordinates.split('E')[1].split("\n")[0]
timezone = soup.find("th", text="Huso horario").find_next_sibling().getText().replace(' ', '')
ins_table = soup.find('table', id="medicion")
if ins_table is not None: # check if there are instruments for air pollution at this station
instruments = ins_table.find_all("td", {"class": "helpTecnica center"})
instruments_per_component = {}
else:
continue
metadata15_per_station = metadata15.loc[metadata15["station_name"] == station_name]
print(region)
print(metadata15_per_station)
print(metadata15_per_station["region"].iloc[0])
for instrument in instruments:
component = instrument.find_parent().find('a').getText()
try: # rename
component = variables_ghost[variables_text.index(component)]
except:
try:
component = variables_ghost[variables_website.index(component)]
except:
pass
i=0
for column in metadata15_per_station.head():
print(column)
if 'Ozono.-' in component:
component = 'O3'
if metadata15_per_station[column].iloc[0] == scraped_metadata[i]:
print("ok!")
#======
if "No informado" in instrument.getText():
instruments_per_component[component] = ''
else:
print("not ok")
instrument_name = re.sub(' +', ' ', instrument.getText())
instrument_name = instrument_name.split("\n")[-1]
instruments_per_component[component] = instrument_name
for station_reference in metadata_old:
if metadata_old[station_reference]['station_name']['values'][0] == station_name_new: # match station with previously referenced station reference from old file
i=0
metadata_new[station_reference] = {} # create inner dictionary
scraped_metadata = [station_reference, station_name_new, region, province, commune, lon, lat, timezone]
for parameter in metadata_old[station_reference]: # loop through the meta parameters
if ("instrument" not in parameter) and ("comments" not in parameter): # go through all that are not instruments
metadata_new[station_reference][parameter] = {"values": [scraped_metadata[i]], "update_time": [today.strftime('%Y-%m-%d')]}
i=i+1
elif "comments" == parameter:
metadata_new[station_reference][parameter] = {"values": [''], "update_time": [today.strftime('%Y-%m-%d')]}
else: # go through the instruments
for component in instruments_per_component:
if component in parameter:
metadata_new[station_reference][parameter] = {"values": [instruments_per_component[component]], "update_time": [today.strftime('%Y-%m-%d')]}
else:
metadata_new[station_reference][parameter] = {"values": [''], "update_time": [today.strftime('%Y-%m-%d')]}
# save
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/CHILE_SINCA_META_{}.json'.format(today.strftime('%Y%m%d')), 'w', encoding='utf-8') as f:
f.write(json.dumps(metadata_new, indent=4, ensure_ascii=False))
driver.close()
errcode = 200
except WebDriverException as e:
print(e)
n_tries = n_tries+1
print("Number of tries: {}".format(n_tries))
continue
if n_tries == n_max_tries:
print('Failed downloading CHILE_SINCA metadata {} times in {} seconds'.format(n_tries, max_time_per_dl))"""
"""
# create json from original metadata file =====================================================================================
json_metadata = {}
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/CHILE_SINCA_META.txt', 'r') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['station_reference']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
driver.close()
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/processed/CHILE_SINCA_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
"""
# read newly scraped file
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/CHILE_SINCA_META_{}.json'.format(today.strftime('%Y%m%d')), 'r', encoding='utf-8') as f:
json_metadata_now = json.loads(f.read())
# read standardised file to compare!
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/processed/CHILE_SINCA_META.json', 'r', encoding='utf-8') as f:
json_metadata = json.loads(f.read())
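# every field is kept as a history: when a scraped value differs from the last stored one, the new value and its date are appended,
# e.g. (illustrative) {"values": ["old value", "new value"], "update_time": ["2023-11-02", "2024-03-15"]}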
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# save
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/processed/CHILE_SINCA_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
\ No newline at end of file
......@@ -15,14 +15,30 @@ import zipfile
import os.path
import os
import pandas as pd
def scraper(mode, version):
def download_data(mode, version, n_max_tries, max_time_per_dl):
url = 'https://monitoring.eanet.asia/document/menu/index#publicData'
download_location = "/esarchive/obs/ghost/EANET/original_files/{}/".format(version)
today = date.today().strftime('%Y%m%d') #+ timedelta(days = 1)).strftime('%Y%m%d') # problem with timezones???
#print(today)
if mode == 'all':
bdate = date(1980, 1, 1) #date(1960, 1, 1) # date before record starts
edate = date.today()
os.makedirs('/esarchive/obs/ghost/EANET/original_files/{}/'.format(version), exist_ok=True)
download_location = "/esarchive/obs/ghost/EANET/original_files/{}/".format(version)
elif mode == 'nrt':
print("EANET no nrt")
exit()
else:
print('time mode inapplicable')
exit()
options = Options()
prefs = {'download.default_directory' : download_location}
......@@ -32,17 +48,17 @@ def scraper(mode, version):
driver = webdriver.Chrome(service=svc, options=options)
driver.get(url)
time.sleep(2)
time.sleep(max_time_per_dl)
# login
email = driver.find_element(By.ID, "email")
email.send_keys("raphael.grodofzig@bsc.es")
passwd = driver.find_element(By.ID, "passwd")
passwd.send_keys("274s9QZ5")
time.sleep(2)
time.sleep(max_time_per_dl)
driver.find_element(By.NAME, "submitBtn").click()
time.sleep(3)
time.sleep(max_time_per_dl)
# find countries
dropdown_element = driver.find_element(By.ID, 'countryCd')
......@@ -105,3 +121,108 @@ def scraper(mode, version):
i=i+1
driver.close()
def download_metadata(n_max_tries, max_time_per_dl):
url_metadata = 'https://www.eanet.asia/wp-content/uploads/2024/01/Site_Information_Acid_Deposition_Monitoring_NMP2023_1117.xlsm'
download_location = "/esarchive/obs/ghost/EANET/metadata/network_provided/EANET_META_{}.csv"
Headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
n_tries = 0
errcode = 999
today = date.today()
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url_metadata, timeout=max_time_per_dl, headers=Headers)
if r.status_code == 200:
with open(download_location.format(today.strftime('%Y%m%d')), 'wb') as outfile:
outfile.write(r.content)
print('Downloaded metadata')
errcode = r.status_code
elif r.status_code == 404:
print("No metadata found, error 404")
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2 # increase waiting time
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
time.sleep(1)
metadata = pd.read_excel(download_location.format(today.strftime('%Y%m%d')), engine='pyxlsb').fillna('')
print(metadata)
"""
# create json from original metadata file
json_metadata = {}
with open('/esarchive/obs/ghost/EANET/metadata/network_provided/EANET_META.csv', 'r', encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['cve_estac']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/EANET/metadata/processed/EANET_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
"""
"""
# create json in desired shape from current metadata file
json_metadata_now = {}
with open(download_location.format(today.strftime('%Y%m%d')), encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['cve_estac']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata_now[key] = row
# read standardised file to compare!
with open('/esarchive/obs/ghost/EANET/metadata/processed/EANET_META.json', 'r', encoding='ISO-8859-1') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if parameter in json_metadata_now[station].keys(): # check if column of csv exists in new file
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('{} not in new metadata file'.format(parameter))
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
for parameter in json_metadata_now[station]: # loop through all the parameters
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# is there a new parameter that wasn't in the old file?
if parameter in json_metadata[station].keys():
pass # parameter (column) is already there
else:
print('{} is new'.format(parameter))
json_metadata[station].update({parameter: json_metadata_now[station][parameter]})
# save
with open('/esarchive/obs/ghost/EANET/metadata/processed/EANET_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))"""
\ No newline at end of file
import requests
import time
from datetime import date
from datetime import timedelta
import zipfile
import urllib
import os.path
import os
import pandas as pd
def scraper(mode):
url = 'https://monitoring.eanet.asia/document/menu/index#publicData'
download_url = 'https://www.eanet.asia/wp-content/uploads/2024/01/Site_Information_Acid_Deposition_Monitoring_NMP2023_1117.xlsm'
download_location = "/esarchive/obs/ghost/EANET/metadata/network_provided/"
today = date.today().strftime('%Y%m%d') #+ timedelta(days = 1)).strftime('%Y%m%d') # problem with timezones???
"""
Headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
r = requests.get(download_url, timeout=120, headers=Headers)
print(r.status_code)
if r.status_code == 200:
urllib.request.urlretrieve(url, download_location+"downloaded_metadata.xlsm")
print('Downloaded metadata')
else:
print('url status not ok')"""
# open file
metadata = pd.read_excel(download_location+"downloaded_metadata.xlsm", engine='pyxlsb').fillna('')
print(metadata)
......@@ -8,20 +8,20 @@ import urllib
import tarfile
import shutil
import gzip
import csv
import json
def scraper(mode, version):
def download_data(mode, version, n_max_tries, max_time_per_dl):
base_url = 'http://www.aire.cdmx.gob.mx/opendata/anuales_horarios_gz/contaminantes_{}.csv.gz'
if mode == 'all':
bdate = date(1980, 1, 1) #date(1960, 1, 1) # date before record starts
edate = date.today()
os.makedirs('/esarchive/obs/ghost/MEXICO_CDMX/original_files/{}/'.format(version), exist_ok=True)
download_location = '/esarchive/obs/ghost/MEXICO_CDMX/original_files/'+version+'/contaminantes_{}.csv.gz'
elif mode == 'nrt':
......@@ -39,21 +39,143 @@ def scraper(mode, version):
# download
for year in years:
url = base_url.format(year)
r = requests.get(url, timeout=120)
n_tries = 0
errcode = 999
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url, timeout=max_time_per_dl)
if r.status_code == 200:
urllib.request.urlretrieve(url, download_location.format(year))
print('Downloaded {}'.format(url))
# unzip
with gzip.open(download_location.format(year), 'rb') as f_in:
with open(download_location.format(year)[:-3], 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
# remove files
os.remove(download_location.format(year))
errcode = r.status_code
elif r.status_code == 404:
print("No data found, error 404, year {}".format(year))
errcode = 200
elif r.status_code == 403:
print("Permission denied for {}".format(year))
errcode = 200
else:
print('No {}'.format(url))
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2 # increase waiting time
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url, n_tries, max_time_per_dl, errcode))
time.sleep(1)
def download_metadata(n_max_tries, max_time_per_dl):
url_metadata = 'http://www.aire.cdmx.gob.mx/opendata/catalogos/cat_estacion.csv'
download_location = "/esarchive/obs/ghost/MEXICO_CDMX/metadata/network_provided/MEXICO_CDMX_META_{}.csv"
n_tries = 0
errcode = 999
today = date.today()
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url_metadata, timeout=max_time_per_dl)
if r.status_code == 200:
with open(download_location.format('_unformatted'), 'wb') as outfile:
outfile.write(r.content)
print('Downloaded metadata')
errcode = r.status_code
elif r.status_code == 404:
print("No metadata found, error 404")
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2 # increase waiting time
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
time.sleep(1)
# import it as pandas to clean header
meta_file = pd.read_csv(download_location.format('_unformatted'), header=[1], encoding='ISO-8859-1')
meta_file.to_csv(download_location.format(today.strftime('%Y%m%d')), index=False)
os.remove(download_location.format('_unformatted'))
# create json from original metadata file
"""json_metadata = {}
with open('/esarchive/obs/ghost/MEXICO_CDMX/metadata/network_provided/MEXICO_CDMX_META.csv', 'r', encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['cve_estac']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/MEXICO_CDMX/metadata/processed/MEXICO_CDMX_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
"""
# create json in desired shape from current metadata file
json_metadata_now = {}
with open(download_location.format(today.strftime('%Y%m%d')), encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['cve_estac']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata_now[key] = row
# read standardised file to compare!
with open('/esarchive/obs/ghost/MEXICO_CDMX/metadata/processed/MEXICO_CDMX_META.json', 'r', encoding='ISO-8859-1') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if parameter in json_metadata_now[station].keys(): # check if column of csv exists in new file
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('{} not in new metadata file'.format(parameter))
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
for parameter in json_metadata_now[station]: # loop through all the parameters
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# is there a new parameter that wasn't in the old file?
if parameter in json_metadata[station].keys():
pass # parameter (column) is already there
else:
print('{} is new'.format(parameter))
json_metadata[station].update({parameter: json_metadata_now[station][parameter]})
# save
with open('/esarchive/obs/ghost/MEXICO_CDMX/metadata/processed/MEXICO_CDMX_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
\ No newline at end of file
......@@ -22,22 +22,18 @@ from selenium.webdriver.support import expected_conditions as EC
def scraper(mode, version):
def download_data(mode, version, n_max_tries, max_time_per_dl):
baseurl = 'https://www.miteco.gob.es/es/calidad-y-evaluacion-ambiental/temas/atmosfera-y-calidad-del-aire/calidad-del-aire/evaluacion-datos/datos/datos-2001-2021.html'
if mode == 'all':
bdate = date(2001, 1, 1) #date(1960, 1, 1) # date before record starts
edate = date.today()
os.makedirs('/esarchive/obs/ghost/MITECO/original_files/{}/'.format(version), exist_ok=True)
download_location = '/esarchive/obs/ghost/MITECO/original_files/{}/'.format(version)
elif mode == 'nrt':
bdate = date(2024, 3, 2) #date.today() - timedelta(days = 1) # if code is run after 2 am, data from previous day will be available
edate = date(2024, 3, 3) #date.today() - timedelta(days = 1)
print("nrt not available")
download_location = '/esarchive/obs/ghost/MITECO/original_files/nrt/'
else:
......@@ -70,23 +66,40 @@ def scraper(mode, version):
for zip_link in zip_links:
filename = zip_link.get("href").rpartition('/')[-1]
url = 'https://www.miteco.gob.es/{}'.format(zip_link.get("href"))
n_tries = 0
errcode = 999
r = requests.get(url, timeout=120)
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url, timeout=max_time_per_dl)
if r.status_code == 200:
urllib.request.urlretrieve(url, download_location+filename)
print('Downloaded {}'.format(filename))
# unzip
with zipfile.ZipFile(download_location+filename, 'r') as zip_ref:
zip_ref.extractall(download_location)
os.remove(download_location+filename)
else:
print('No {}'.format(url))
errcode = r.status_code
time.sleep(1)
elif r.status_code == 404:
print("No data found, error 404")
errcode = 200
elif r.status_code == 403:
print("Permission denied for {}".format(url))
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url, n_tries, max_time_per_dl, errcode))
time.sleep(1)
# go to hyperlinks
......@@ -119,7 +132,11 @@ def scraper(mode, version):
continue
r = requests.get(url, timeout=120)
n_tries = 0
errcode = 999
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url, timeout=max_time_per_dl)
if r.status_code == 200:
urllib.request.urlretrieve(url, download_location+filename)
print('Downloaded {}'.format(filename))
......@@ -129,10 +146,26 @@ def scraper(mode, version):
zip_ref.extractall(download_location)
os.remove(download_location+filename)
errcode = r.status_code
else:
print('No {}'.format(url))
elif r.status_code == 404:
print("No data found, error 404")
errcode = 200
elif r.status_code == 403:
print("Permission denied for {}".format(url))
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url, n_tries, max_time_per_dl, errcode))
time.sleep(1)
# delete metadata
......@@ -152,3 +185,96 @@ def scraper(mode, version):
driver.close()
def download_metadata(n_max_tries, max_time_per_dl):
url_metadata = 'https://www.miteco.gob.es/content/dam/miteco/es/calidad-y-evaluacion-ambiental/sgalsi/atm%C3%B3sfera-y-calidad-del-aire/evaluaci%C3%B3n-2022/Metainformacion2022.xlsx'
download_location = "/esarchive/obs/ghost/MITECO/metadata/network_provided/MITECO_META_{}.xlsx"
Headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
n_tries = 0
errcode = 999
today = date.today()
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url_metadata, headers=Headers, timeout=max_time_per_dl) # issue the request on every attempt so retries actually re-download
if r.status_code == 200:
with open(download_location.format(today.strftime('%Y%m%d')), 'wb') as outfile:
outfile.write(r.content)
print('Downloaded metadata')
errcode = r.status_code
elif r.status_code == 404:
print("No metadata found, error 404")
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
time.sleep(1)
# convert to csv
file = pd.read_excel(download_location.format(today.strftime('%Y%m%d')))
file.to_csv('/esarchive/obs/ghost/MITECO/metadata/network_provided/MITECO_META_{}.csv'.format(today.strftime('%Y%m%d')), index=False, header=True)
"""# create json from original metadata file
json_metadata = {}
with open('/esarchive/obs/ghost/MITECO/metadata/network_provided/MITECO_META.csv', 'r', encoding='ISO-8859-1') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['SiteName_NomDuSite']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/MITECO/metadata/processed/MITECO_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
# create json in desired shape from current metadata file
json_metadata_now = {}
with open(download_location.format(today.strftime('%Y%m%d')), encoding='ISO-8859-1') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['SiteName_NomDuSite']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata_now[key] = row
# read standardised file to compare!
with open('/esarchive/obs/ghost/MITECO/metadata/processed/MITECO_META.json', 'r', encoding='ISO-8859-1') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# save
with open('/esarchive/obs/ghost/MITECO/metadata/processed/MITECO_META.json', 'w', encoding='ISO-8859-1') as f:
f.write(json.dumps(json_metadata, indent=4))"""
......@@ -7,20 +7,21 @@ import re
import os
from datetime import date
from datetime import timedelta
import requests
import csv
import json
import time
def scraper(mode, version):
def download_data(mode, version, n_max_tries, max_time_per_dl):
if mode == 'all':
start_year = 1971
end_year = 2024
elif mode == 'nrt':
start_year = date.today().strftime('%Y')
end_year = (date.today() + timedelta(days=365)).strftime('%Y')
version = mode
else:
......@@ -36,7 +37,7 @@ def scraper(mode, version):
read_url = False
while read_url == False:
try:
link_data = re.findall("href=[\"\'](.*?)[\"\']", urlopen(link_url, timeout=15, cafile=certifi.where()).read().decode('utf-8-sig'))
link_data = re.findall("href=[\"\'](.*?)[\"\']", urlopen(link_url, timeout=max_time_per_dl, cafile=certifi.where()).read().decode('utf-8-sig'))
read_url = True
except HTTPError as error:
print('Data not retrieved because %s\nURL: %s'%(error, link_url))
......@@ -57,11 +58,10 @@ def scraper(mode, version):
#handles issue of server hanging for 3 minutes sporadically
#try downloading each link a certain number of times before giving up
n_tries_limit = 3
for link in link_list:
n_tries = 0
errcode = 999
while (n_tries < n_tries_limit) & (errcode != 0):
while (n_tries < n_max_tries) & (errcode != 0):
if n_tries == 0:
print('Checking/Downloading %s'%(link))
else:
......@@ -92,3 +92,102 @@ def scraper(mode, version):
process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf8', shell=True)
status = process.communicate()[0]
errcode = process.returncode
def download_metadata(n_max_tries, max_time_per_dl):
url_metadata = 'https://www.ncei.noaa.gov/pub/data/noaa/isd-history.csv'
download_location = "/esarchive/obs/ghost/NOAA_ISD/metadata/network_provided/NOAA_ISD_META_{}.csv"
n_tries = 0
errcode = 999
today = date.today()
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url_metadata, timeout=max_time_per_dl)
if r.status_code == 200:
with open(download_location.format(today.strftime('%Y%m%d')), 'wb') as outfile:
outfile.write(r.content)
print('Downloaded metadata')
errcode = r.status_code
elif r.status_code == 404:
print("No metadata found, error 404")
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2 # increase waiting time
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
time.sleep(1)
"""
# create json from original metadata file
json_metadata = {}
with open('/esarchive/obs/ghost/NOAA_ISD/metadata/network_provided/NOAA_ISD_META.csv', 'r', encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['USAF']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/NOAA_ISD/metadata/processed/NOAA_ISD_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
"""
# create json in desired shape from current metadata file
json_metadata_now = {}
with open(download_location.format(today.strftime('%Y%m%d')), encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['USAF']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata_now[key] = row
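# resulting shape (illustrative): {"<USAF id>": {"USAF": {"values": ["<id>"], "update_time": ["<download date>"]}, "<other columns>": {...}}}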
# read standardised file to compare!
with open('/esarchive/obs/ghost/NOAA_ISD/metadata/processed/NOAA_ISD_META.json', 'r', encoding='utf-8') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if parameter in json_metadata_now[station].keys(): # check if column of csv exists in new file
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('{} not in new metadata file'.format(parameter))
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
for parameter in json_metadata_now[station]: # loop through all the parameters
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# is there a new parameter that wasn't in the old file?
if parameter in json_metadata[station].keys():
pass # parameter (column) is already there
else:
print('{} is new'.format(parameter))
json_metadata[station].update({parameter: json_metadata_now[station][parameter]})
# save
with open('/esarchive/obs/ghost/NOAA_ISD/metadata/processed/NOAA_ISD_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
\ No newline at end of file
......@@ -47,15 +47,14 @@ def download_data(mode, version, n_max_tries, max_time_per_dl):
options.add_argument("--no-sandbox")
#options.add_argument("--headless")
svc = webdriver.ChromeService(executable_path=binary_path)
driver = webdriver.Chrome(service=svc, options=options)
n_tries = 0
errcode = 999
while (n_tries < n_max_tries) and (errcode != 200):
try:
svc = webdriver.ChromeService(executable_path=binary_path)
driver = webdriver.Chrome(service=svc, options=options)
# open url
driver.get(baseurl)
WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.ID, 'sites-list'))) # wait till loaded
......@@ -110,9 +109,6 @@ def download_data(mode, version, n_max_tries, max_time_per_dl):
if n_tries == n_max_tries:
print('Failed downloading US_NADP_AMNet data {} times in {} seconds'.format(n_tries, max_time_per_dl))
print(os.path.split(download_location[:-5]))
os.rename("{}AMNET-ALL-h.csv".format(download_location), "/esarchive/obs/ghost/US_NADP_AMNet/original_files/{}/AMNET-ALL-h.csv".format(version))
......@@ -229,7 +225,7 @@ def download_metadata(n_max_tries, max_time_per_dl):
for parameter in json_metadata[station]:
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadata file
print("new {} --- old {}".format(json_metadata_now[station][parameter]['values'][0], json_metadata[station][parameter]['values'][-1]))
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
......
......@@ -8,9 +8,9 @@ import pandas
import os.path
import urllib
import time
import ssl
import zipfile
from compare_two_files import compare_files
import json
import csv
from chromedriver_py import binary_path
from selenium.webdriver.chrome.options import Options
......@@ -22,22 +22,16 @@ from selenium.webdriver.support import expected_conditions as EC
def scraper(mode, version):
def download_data(mode, version, n_max_tries, max_time_per_dl):
baseurl = 'https://nadp.slh.wisc.edu/networks/ammonia-monitoring-network/'
if mode == 'all':
bdate = date(2013, 12, 1) #date(1960, 1, 1) # date before record starts
edate = date(2024, 1, 1) #date.today() - timedelta(days = 1)
os.makedirs('/esarchive/obs/ghost/US_NADP_AMoN/original_files/{}/'.format(version), exist_ok=True)
download_location = '/esarchive/obs/ghost/US_NADP_AMoN/original_files/{}/'.format(version)
os.makedirs('/esarchive/obs/ghost/US_NADP_AMoN/original_files/{}/temp/'.format(version), exist_ok=True)
download_location = '/esarchive/obs/ghost/US_NADP_AMoN/original_files/{}/temp/'.format(version)
elif mode == 'nrt':
bdate = date(2024, 1, 1) #date.today() - timedelta(days = 1)
edate = date.today() - timedelta(days = 1)
download_location = '/esarchive/obs/ghost/US_NADP_AMoN/original_files/nrt/'
download_location = '/esarchive/obs/ghost/US_NADP_AMoN/original_files/nrt/temp/'
else:
print('time mode inapplicable')
......@@ -52,29 +46,30 @@ def scraper(mode, version):
svc = webdriver.ChromeService(executable_path=binary_path)
driver = webdriver.Chrome(service=svc, options=options)
n_tries = 0
errcode = 999
while (n_tries < n_max_tries) and (errcode != 200):
try:
# open url
driver.get(baseurl)
WebDriverWait(driver, 60).until(EC.presence_of_all_elements_located((By.ID, 'sites-list'))) # wait till loaded
WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.ID, 'sites-list'))) # wait till loaded
dropdown_element = driver.find_element(By.ID, 'data-type')
select = Select(dropdown_element)
options = [opt.get_attribute("text") for opt in select.options]
print(options)
select.select_by_visible_text("Bi-weekly")
time.sleep(3)
time.sleep(max_time_per_dl)
dropdown_element = driver.find_element(By.ID, 'sites-list')
select = Select(dropdown_element)
options = [opt.get_attribute("text") for opt in select.options]
print(options)
select.select_by_visible_text("All Sites")
time.sleep(3)
time.sleep(max_time_per_dl)
invalid_box = driver.find_element(By.ID, 'invalid')
invalid_box.click()
time.sleep(3)
time.sleep(max_time_per_dl)
# download
driver.find_element(By.ID, 'generate-button-text').click()
......@@ -85,5 +80,164 @@ def scraper(mode, version):
if os.path.isfile("{}AMoN-ALL-W-i.csv".format(download_location)):
print('AMoN-ALL-W-i.csv download successful')
errcode = 200
continue
except TimeoutException as e:
print(e)
max_time_per_dl = max_time_per_dl*2 # set waiting time to double
n_tries = n_tries+1
print("Number of tries: {}".format(n_tries))
continue
except WebDriverException as e:
print(e)
n_tries = n_tries+1
print("Number of tries: {}".format(n_tries))
continue
except:
print("Unknown error")
n_tries = n_tries+1
print("Number of tries: {}".format(n_tries))
continue
driver.close()
if n_tries == n_max_tries:
print('Failed downloading US_NADP_AMoN data {} times in {} seconds'.format(n_tries, max_time_per_dl))
os.rename("{}AMoN-ALL-W-i.csv".format(download_location), "/esarchive/obs/ghost/US_NADP_AMoN/original_files/{}/AMoN-ALL-W-i.csv".format(version))
def download_metadata(n_max_tries, max_time_per_dl):
baseurl = 'https://nadp.slh.wisc.edu/networks/ammonia-monitoring-network/'
#os.makedirs('/esarchive/obs/ghost/US_NADP_AMNet/metadata/network_provided/', exist_ok=True)
download_location = '/esarchive/obs/ghost/US_NADP_AMoN/metadata/network_provided/'
today = date.today()
# set up driver
options = Options()
prefs = {'download.default_directory' : download_location}
options.add_experimental_option('prefs', prefs)
options.add_argument("--no-sandbox")
options.add_argument("--headless")
n_tries = 0
errcode = 999
svc = webdriver.ChromeService(executable_path=binary_path)
driver = webdriver.Chrome(service=svc, options=options)
driver.maximize_window()
while (n_tries < n_max_tries) and (errcode != 200):
try:
# open url
driver.get(baseurl)
#WebDriverWait(driver, max_time_per_dl).until(EC.element_to_be_clickable((By.ID, 'invalid'))) # wait till loaded
time.sleep(max_time_per_dl)
invalid_box = driver.find_element(By.ID, 'download-show-inactive')
driver.execute_script("arguments[0].click()", invalid_box)
# download
#WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.ID, 'generate-button-text'))) # wait till loaded
time.sleep(max_time_per_dl)
bttn = driver.find_element(By.ID, 'network-data-submit')
driver.execute_script("arguments[0].click()", bttn)
# wait until download finished
while not os.path.exists(download_location+'amon.csv'):
time.sleep(1)
if os.path.isfile(download_location+'amon.csv'):
print('Amon metadata download successful')
errcode = 200
continue
except TimeoutException as e:
print(e)
max_time_per_dl = max_time_per_dl*2 # set waiting time to double
n_tries = n_tries+1
continue
except WebDriverException as e:
print(e)
n_tries = n_tries+1
print("Number of tries: {}".format(n_tries))
continue
except:
print("Unknown error")
max_time_per_dl = max_time_per_dl*2 # set waiting time to double
n_tries = n_tries+1
continue
if n_tries == n_max_tries:
print('Failed downloading AMoN metadata {} times in {} seconds'.format(n_tries, max_time_per_dl))
driver.close()
os.rename(download_location+'amon.csv', download_location+'US_NADP_AMoN_META_{}.csv'.format(today.strftime('%Y%m%d')))
# create json from original metadata file =====================================================================================
"""json_metadata = {}
with open('/esarchive/obs/ghost/US_NADP_AMoN/metadata/network_provided/US_NADP_AMoN_META.csv', 'r') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['siteId']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/US_NADP_AMoN/metadata/processed/US_NADP_AMoN_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
"""
# create json in desired shape from current metadata file
json_metadata_now = {}
with open(download_location+'US_NADP_AMoN_META_{}.csv'.format(today.strftime('%Y%m%d')), encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['siteId']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata_now[key] = row
# read standardised file to compare!
with open('/esarchive/obs/ghost/US_NADP_AMoN/metadata/processed/US_NADP_AMoN_META.json', 'r', encoding='utf-8') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# save
with open('/esarchive/obs/ghost/US_NADP_AMoN/metadata/processed/US_NADP_AMoN_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
import hashlib # works for all types of data
import requests
def request_download(url, max_time_per_dl, download_location, file, year, n_max_tries):
Headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
n_tries = 0
errcode = 999
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url, headers=Headers, timeout=max_time_per_dl) # request on every attempt so retries actually re-download
if r.status_code == 200:
open(download_location + file.format(year), "wb").write(r.content)
print('Downloaded {}'.format(file.format(year)))
errcode = r.status_code
elif r.status_code == 404:
print("No ozone l data found, error 404")
errcode = 200
elif r.status_code == 403:
print("Permission denied for {}".format(file.format(year)))
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url, n_tries, max_time_per_dl, errcode))
time.sleep(1)
# check if files are different
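# A minimal sketch of how hashlib could back this check by comparing content digests; the function name
# and arguments below are illustrative assumptions, not part of the existing scraper code.
def files_differ(path_a, path_b):
    with open(path_a, 'rb') as f_a, open(path_b, 'rb') as f_b:
        return hashlib.sha256(f_a.read()).hexdigest() != hashlib.sha256(f_b.read()).hexdigest()
# e.g. files_differ(download_location + file.format(year), previously_stored_path)  # previously_stored_path is a hypothetical name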
......
......@@ -61,7 +61,13 @@ if __name__ == "__main__":
'CNEMC': {'max_time_dl': 3},
'CANADA_NAPS': {'max_time_dl': 5},
'CAPMoN': {'max_time_dl': 5},
'US_NADP_AMNet': {'max_time_dl': 10}}
'US_NADP_AMNet': {'max_time_dl': 10},
'US_NADP_AMoN': {'max_time_dl': 7},
'MEXICO_CDMX': {'max_time_dl': 10},
'NOAA_ISD': {'max_time_dl': 15},
'MITECO': {'max_time_dl': 10},
'EANET': {'max_time_dl': 5},
'CHILE_SINCA': {'max_time_dl': 30}}
# download data
......@@ -71,7 +77,7 @@ if __name__ == "__main__":
dl_metadata = True
# networks you want to download
networks = [US_NADP_AMNet_download]
networks = [CHILE_SINCA_download]
# download all networks
#networks = ['all']
......@@ -107,8 +113,6 @@ if __name__ == "__main__":
if dl_data == True:
network.download_data(mode, version, n_max_tries, max_time_per_dl)
pass
if dl_metadata == True:
network.download_metadata(n_max_tries, max_time_per_dl)
pass