Commits (2)
@@ -171,20 +171,21 @@ def download_metadata(n_max_tries, max_time_per_dl):
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
time.sleep(1)
# create json from original metadata file
json_metadata = {}
with open('/esarchive/obs/ghost/CAPMoN/metadata/network_provided/CAPMoN_META.csv', 'r', encoding='ISO-8859-1') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['ID']+'_'+row['Measurements_Mesures'].replace('"', '')[:3]
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/CAPMoN/metadata/processed/CAPMoN_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
# create json in desired shape from current metadata file
@@ -193,7 +194,7 @@ def download_metadata(n_max_tries, max_time_per_dl):
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['ID']+'_'+row['Measurements_Mesures'].replace('"', '')[:3]
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
@@ -218,11 +219,18 @@ def download_metadata(n_max_tries, max_time_per_dl):
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
for parameter in json_metadata_now[station]: # loop through all the parameters
if station in json_metadata.keys(): # if station is in old metadata
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# is there a new parameter that wasn't in the old file?
if parameter in json_metadata[station].keys():
pass # parameter (column) is already there
else:
print('{} is new'.format(parameter))
json_metadata[station].update({parameter: json_metadata_now[station][parameter]})
# save
...
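# Hedged sketch (not part of the diff): illustrates the per-parameter history
# structure that the comparison loops above maintain. `merge_station` and the
# example values are hypothetical; only the {'values': [...], 'update_time': [...]}
# shape and the append-on-change rule are taken from the code above.
import json

def merge_station(old_station, new_station):
    """Append changed values to each parameter's history, add new parameters."""
    for parameter, new_entry in new_station.items():
        if parameter not in old_station:
            old_station[parameter] = new_entry  # new column appeared
        elif old_station[parameter]['values'][-1] != new_entry['values'][0]:
            old_station[parameter]['values'].append(new_entry['values'][0])
            old_station[parameter]['update_time'].append(new_entry['update_time'][0])
    return old_station

old = {'Latitude': {'values': ['45.0'], 'update_time': ['2024-01-01']}}
new = {'Latitude': {'values': ['45.1'], 'update_time': ['2024-03-01']},
       'Elevation': {'values': ['120'], 'update_time': ['2024-03-01']}}
print(json.dumps(merge_station(old, new), indent=4))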
@@ -5,10 +5,11 @@ from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from bs4 import BeautifulSoup
import requests
import time
@@ -23,14 +24,16 @@ import zipfile
import shutil
import os
import re
import csv
import json
def download_data(mode, version, n_max_tries, max_time_per_dl):
# paths and variables
variables_website = ['MP 10','MP 2,5', 'MP 10discreto', 'SO2', 'NO2', 'NOX', 'NO', 'CO', 'O3', 'Pb', 'As', 'Cu']
variables_text = ["Material particulado MP 10", "Material particulado MP 2,5", "Material particulado 10 micrometros discreto", 'Dióxido de azufre', 'Dióxido de nitrógeno', 'Óxidos de nitrógeno', 'Monóxido de nitrógeno', 'Monóxido de carbono', 'Ozono', 'Plomo', 'Arsénico', 'Cobre']
time_resolutions_website = ["registro diario", "registro horario"]
variables_ghost = ['PM10', 'PM2.5', 'PM10disc', 'SO2', 'NO2', 'NOX', 'NO','CO', 'O3','Pb', 'As', 'Cu'] # needs to be same order as variables_website!
time_resolution_ghost = ['daily', 'hourly']
@@ -39,15 +42,28 @@ def scraper(mode, version):
baseurl = 'https://sinca.mma.gob.cl/index.php/'
if mode == 'all':
# create download directory
os.makedirs('/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version), exist_ok=True)
download_location = '/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version)
elif mode == 'nrt':
bdate = date(date.today().year, 1, 1).strftime('%Y%m%d')[2:] #"240101"
edate = date.today().strftime('%Y%m%d')[2:]
print(edate)
# create download directory
version = mode
os.makedirs('/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version), exist_ok=True)
download_location = '/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version)
else:
print('time mode inapplicable')
n_tries = 0
errcode = 999
while (n_tries < n_max_tries) and (errcode != 200):
try:
# set up driver
options = Options()
prefs = {'download.default_directory' : download_location}
@@ -57,12 +73,14 @@ def scraper(mode, version):
options.add_argument("--disable-extensions")
options.add_argument("--disable-gpu")
options.add_argument("--disable-dev-shm-usage")
if n_tries > 0:
options.add_argument("--headless")
svc = webdriver.ChromeService(executable_path=binary_path)
driver = webdriver.Chrome(service=svc, options=options)
# open url
driver.get(baseurl)
WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.XPATH, '//a[contains(@href, "/index.php/region/index/id/")]'))) # wait till loaded
# navigate to regions
html = driver.page_source
@@ -145,7 +163,7 @@ def scraper(mode, version):
driver.switch_to.frame("right")
time.sleep(10)
WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.LINK_TEXT, "Excel CSV"))) # wait till loaded
driver.find_element(By.LINK_TEXT, "Excel CSV").click()
@@ -162,43 +180,71 @@ def scraper(mode, version):
driver.switch_to.default_content()
driver.switch_to.frame("left")
i=i+1
driver.close()
errcode = 200
except WebDriverException as e:
print(e)
n_tries = n_tries+1
print("Number of tries: {}".format(n_tries))
continue
if n_tries == n_max_tries:
print('Failed downloading CHILE_SINCA data {} times in {} seconds'.format(n_tries, max_time_per_dl))
driver.close()
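# Hedged sketch (not from the diff): the retry pattern used above, factored into a
# helper. `run_session` and `attempt_download` are hypothetical names; the attempt
# counter, the WebDriverException handling and the headless fallback after a failed
# attempt mirror the code in download_data above.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import WebDriverException
from chromedriver_py import binary_path

def run_session(attempt_download, n_max_tries):
    n_tries, errcode = 0, 999
    while (n_tries < n_max_tries) and (errcode != 200):
        options = Options()
        options.add_argument("--no-sandbox")
        if n_tries > 0:  # fall back to headless after the first failure
            options.add_argument("--headless")
        svc = webdriver.ChromeService(executable_path=binary_path)
        driver = webdriver.Chrome(service=svc, options=options)
        try:
            attempt_download(driver)  # caller does the clicking/scraping
            errcode = 200
        except WebDriverException as e:
            print(e)
            n_tries += 1
            print("Number of tries: {}".format(n_tries))
        finally:
            driver.quit()  # always release the browser session
    return errcode == 200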
def download_metadata(n_max_tries, max_time_per_dl):
# paths and variables
variables_website = ['MP 10','MP 2,5', 'MP 10discreto', 'SO2', 'NO2', 'NOX', 'NO', 'CO', 'O3', 'Pb', 'As', 'Cu']
variables_text = ["Material particulado MP 10", "Material particulado MP 2,5", "Material particulado 10 micrometros discreto", 'Dióxido de azufre', 'Dióxido de nitrógeno', 'Óxidos de nitrógeno', 'Monóxido de nitrógeno', 'Monóxido de carbono', 'Ozono', 'Plomo', 'Arsénico', 'Cobre']
time_resolutions_website = ["registro diario", "registro horario"]
variables_ghost = ['PM10', 'PM2.5', 'PM10disc', 'SO2', 'NO2', 'NOX', 'NO','CO', 'O3','Pb', 'As', 'Cu'] # needs to be same order as variables_website!
time_resolution_ghost = ['daily', 'hourly']
baseurl = 'https://sinca.mma.gob.cl/index.php/'
today = date.today()
metadata15 = pd.read_csv("/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/temp/CHILE_SINCA_META_temp.txt")
#print(metadata15)
instruments_old = ['O3_instrument','NO_instrument','NO2_instrument','CO_instrument','CH4_instrument','SO2_instrument','NMHC_instrument','HC_instrument','PM10_instrument','PM2.5_instrument','As_instrument','Cu_instrument','Pb_instrument']
"""
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/processed/CHILE_SINCA_META.json', 'r', encoding='utf-8') as f:
metadata_old = json.loads(f.read())
n_tries = 0
errcode = 999
metadata_new = {}
while (n_tries < n_max_tries) and (errcode != 200):
try:
# set up driver
options = Options()
#prefs = {'download.default_directory' : download_location}
#options.add_experimental_option('prefs', prefs)
options.add_argument("--no-sandbox")
options.add_argument("--headless")
svc = webdriver.ChromeService(executable_path=binary_path)
driver = webdriver.Chrome(service=svc, options=options)
# open url
driver.get(baseurl)
WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.XPATH, '//a[contains(@href, "/index.php/region/index/id/")]'))) # wait till loaded
# navigate to regions
html = driver.page_source
soup = BeautifulSoup(html, features="html.parser")
regions = soup.find_all("a", href=re.compile(r"^/index.php/region/index/id/"))
#regions = soup.find_all("a", {"href": "/index.php/region/index/id/II"}) # Antofagasta
for region in regions:
print("Region is "+region.getText())
driver.get("https://sinca.mma.gob.cl/"+region.get("href"))
time.sleep(1)
html = driver.page_source
soup = BeautifulSoup(html, features="html.parser")
@@ -207,8 +253,8 @@ def scraper_metadata(mode, version):
for station in stations:
station_name_new = station.getText()
print(station_name_new)
driver.get("https://sinca.mma.gob.cl/"+station.get("href"))
time.sleep(3)
@@ -220,29 +266,121 @@ def scraper_metadata(mode, version):
region = soup.find("th", text="Región").find_next_sibling().getText()
province = soup.find("th", text="Provincia").find_next_sibling().getText()
commune = soup.find("th", text="Comuna").find_next_sibling().getText()
UTM_coordinates = soup.find("th", text="Coordenadas UTM").find_next_sibling().getText().replace(' ', '')
lon = UTM_coordinates.split('E')[0]+'E'
lat = UTM_coordinates.split('E')[1].split("\n")[0]
timezone = soup.find("th", text="Huso horario").find_next_sibling().getText().replace(' ', '')
ins_table = soup.find('table', id="medicion")
if ins_table is not None: # check if there are instruments for air pollution at this station
instruments = ins_table.find_all("td", {"class": "helpTecnica center"})
instruments_per_component = {}
else:
continue
for instrument in instruments:
component = instrument.find_parent().find('a').getText()
try: # rename
component = variables_ghost[variables_text.index(component)]
except:
try:
component = variables_ghost[variables_website.index(component)]
except:
pass
if 'Ozono.-' in component:
component = 'O3'
#======
if "No informado" in instrument.getText():
instruments_per_component[component] = ''
else:
instrument_name = re.sub(' +', ' ', instrument.getText())
instrument_name = instrument_name.split("\n")[-1]
instruments_per_component[component] = instrument_name
for station_reference in metadata_old:
if metadata_old[station_reference]['station_name']['values'][0] == station_name_new: # match station with previously referenced station reference from old file
i=0
metadata_new[station_reference] = {} # create inner dictionary
scraped_metadata = [station_reference, station_name_new, region, province, commune, lon, lat, timezone]
for parameter in metadata_old[station_reference]: # loop through the meta parameters
if ("instrument" not in parameter) and ("comments" not in parameter): # go through all that are not instruments
metadata_new[station_reference][parameter] = {"values": [scraped_metadata[i]], "update_time": [today.strftime('%Y-%m-%d')]}
i=i+1
elif "comments" == parameter:
metadata_new[station_reference][parameter] = {"values": [''], "update_time": [today.strftime('%Y-%m-%d')]}
else: # go through the instruments
for component in instruments_per_component:
if component in parameter:
metadata_new[station_reference][parameter] = {"values": [instruments_per_component[component]], "update_time": [today.strftime('%Y-%m-%d')]}
else:
metadata_new[station_reference][parameter] = {"values": [''], "update_time": [today.strftime('%Y-%m-%d')]}
# save
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/CHILE_SINCA_META_{}.json'.format(today.strftime('%Y%m%d')), 'w', encoding='utf-8') as f:
f.write(json.dumps(metadata_new, indent=4, ensure_ascii=False))
driver.close()
errcode = 200
except WebDriverException as e:
print(e)
n_tries = n_tries+1
print("Number of tries: {}".format(n_tries))
continue
if n_tries == n_max_tries:
print('Failed downloading CHILE_SINCA metadata {} times in {} seconds'.format(n_tries, max_time_per_dl))"""
"""
# create json from original metadata file =====================================================================================
json_metadata = {}
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/CHILE_SINCA_META.txt', 'r') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['station_reference']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/processed/CHILE_SINCA_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
"""
# read newly scraped file
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/CHILE_SINCA_META_{}.json'.format(today.strftime('%Y%m%d')), 'r', encoding='utf-8') as f:
json_metadata_now = json.loads(f.read())
# read standardised file to compare!
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/processed/CHILE_SINCA_META.json', 'r', encoding='utf-8') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# save
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/processed/CHILE_SINCA_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
\ No newline at end of file
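# Hedged sketch (not from the diff): the name-to-reference matching used in the
# (currently commented-out) scraping block above, as a small helper.
# `find_station_reference` and the 'CL001' example key are hypothetical; the lookup
# (compare the scraped station name against the stored 'station_name' value of each
# reference) follows the code above.
def find_station_reference(metadata_old, station_name_new):
    """Return the station_reference whose stored station_name matches, else None."""
    for station_reference, parameters in metadata_old.items():
        if parameters['station_name']['values'][0] == station_name_new:
            return station_reference
    return None

metadata_old = {'CL001': {'station_name': {'values': ["Parque O'Higgins"], 'update_time': ['2024-01-01']}}}
print(find_station_reference(metadata_old, "Parque O'Higgins"))  # -> CL001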
@@ -15,14 +15,30 @@ import zipfile
import os.path
import os
import pandas as pd
def download_data(mode, version, n_max_tries, max_time_per_dl):
url = 'https://monitoring.eanet.asia/document/menu/index#publicData'
download_location = "/esarchive/obs/ghost/EANET/original_files/{}/".format(version)
today = date.today().strftime('%Y%m%d') #+ timedelta(days = 1)).strftime('%Y%m%d') # problem with timezones???
#print(today)
if mode == 'all':
bdate = date(1980, 1, 1) #date(1960, 1, 1) # date before record starts
edate = date.today()
os.makedirs('/esarchive/obs/ghost/EANET/original_files/{}/'.format(version), exist_ok=True)
download_location = "/esarchive/obs/ghost/EANET/original_files/{}/".format(version)
elif mode == 'nrt':
print("EANET no nrt")
exit()
else:
print('time mode inapplicable')
exit()
options = Options()
prefs = {'download.default_directory' : download_location}
@@ -32,17 +48,17 @@ def scraper(mode, version):
driver = webdriver.Chrome(service=svc, options=options)
driver.get(url)
time.sleep(max_time_per_dl)
# login
email = driver.find_element(By.ID, "email")
email.send_keys("raphael.grodofzig@bsc.es")
passwd = driver.find_element(By.ID, "passwd")
passwd.send_keys("274s9QZ5")
time.sleep(max_time_per_dl)
driver.find_element(By.NAME, "submitBtn").click()
time.sleep(max_time_per_dl)
# find countries
dropdown_element = driver.find_element(By.ID, 'countryCd')
@@ -105,3 +121,108 @@ def scraper(mode, version):
i=i+1
driver.close()
def download_metadata(n_max_tries, max_time_per_dl):
url_metadata = 'https://www.eanet.asia/wp-content/uploads/2024/01/Site_Information_Acid_Deposition_Monitoring_NMP2023_1117.xlsm'
download_location = "/esarchive/obs/ghost/EANET/metadata/network_provided/EANET_META_{}.csv"
Headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
n_tries = 0
errcode = 999
today = date.today()
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url_metadata, timeout=max_time_per_dl, headers=Headers)
if r.status_code == 200:
with open(download_location.format(today.strftime('%Y%m%d')), 'wb') as outfile:
outfile.write(r.content)
print('Downloaded metadata')
errcode = r.status_code
elif r.status_code == 404:
print("No metadata found, error 404")
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2 # increase waiting time
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
time.sleep(1)
metadata = pd.read_excel(download_location.format(today.strftime('%Y%m%d')), engine='pyxlsb').fillna('')
print(metadata)
"""
# create json from original metadata file
json_metadata = {}
with open('/esarchive/obs/ghost/EANET/metadata/network_provided/EANET_META.csv', 'r', encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['cve_estac']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/EANET/metadata/processed/EANET_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
"""
"""
# create json in desired shape from current metadata file
json_metadata_now = {}
with open(download_location.format(today.strftime('%Y%m%d')), encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['cve_estac']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata_now[key] = row
# read standardised file to compare!
with open('/esarchive/obs/ghost/EANET/metadata/processed/EANET_META.json', 'r', encoding='ISO-8859-1') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if parameter in json_metadata_now[station].keys(): # check if column of csv exists in new file
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('{} not in new metadata file'.format(parameter))
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
for parameter in json_metadata_now[station]: # loop through all the parameters
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# is there a new parameter that wasn't in the old file?
if parameter in json_metadata[station].keys():
pass # parameter (column) is already there
else:
print('{} is new'.format(parameter))
json_metadata[station].update({parameter: json_metadata_now[station][parameter]})
# save
with open('/esarchive/obs/ghost/EANET/metadata/processed/EANET_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))"""
\ No newline at end of file
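# Hedged sketch (not from the diff): loading the EANET site-information workbook
# that download_metadata saves above. The download keeps a .csv name but the URL
# points to an .xlsm workbook; pandas reads macro-enabled OOXML files with the
# openpyxl engine (pyxlsb targets binary .xlsb files). The path and sheet index
# below are assumptions for illustration.
import pandas as pd

def load_eanet_site_table(workbook_path):
    """Read the first sheet of the downloaded workbook and blank out NaNs."""
    return pd.read_excel(workbook_path, engine='openpyxl', sheet_name=0).fillna('')

# Example (hypothetical path):
# metadata = load_eanet_site_table('/esarchive/obs/ghost/EANET/metadata/network_provided/EANET_META_20240301.csv')
# print(metadata.head())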
import requests
import time
from datetime import date
from datetime import timedelta
import zipfile
import urllib
import os.path
import os
import pandas as pd
def scraper(mode):
url = 'https://monitoring.eanet.asia/document/menu/index#publicData'
download_url = 'https://www.eanet.asia/wp-content/uploads/2024/01/Site_Information_Acid_Deposition_Monitoring_NMP2023_1117.xlsm'
download_location = "/esarchive/obs/ghost/EANET/metadata/network_provided/"
today = date.today().strftime('%Y%m%d') #+ timedelta(days = 1)).strftime('%Y%m%d') # problem with timezones???
"""
Headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
r = requests.get(download_url, timeout=120, headers=Headers)
print(r.status_code)
if r.status_code == 200:
urllib.request.urlretrieve(url, download_location+"downloaded_metadata.xlsm")
print('Downloaded metadata')
else:
print('url status not ok')"""
# open file
metadata = pd.read_excel(download_location+"downloaded_metadata.xlsm", engine='pyxlsb').fillna('')
print(metadata)
@@ -8,20 +8,20 @@ import urllib
import tarfile
import shutil
import gzip
import csv
import json
def download_data(mode, version, n_max_tries, max_time_per_dl):
base_url = 'http://www.aire.cdmx.gob.mx/opendata/anuales_horarios_gz/contaminantes_{}.csv.gz'
if mode == 'all':
bdate = date(1980, 1, 1) #date(1960, 1, 1) # date before record starts
edate = date.today()
os.makedirs('/esarchive/obs/ghost/MEXICO_CDMX/original_files/{}/'.format(version), exist_ok=True)
download_location = '/esarchive/obs/ghost/MEXICO_CDMX/original_files/'+version+'/contaminantes_{}.csv.gz'
elif mode == 'nrt':
@@ -39,21 +39,143 @@ def scraper(mode, version):
# download
for year in years:
url = base_url.format(year)
n_tries = 0
errcode = 999
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url, timeout=max_time_per_dl)
if r.status_code == 200:
urllib.request.urlretrieve(url, download_location.format(year))
print('Downloaded {}'.format(url))
# unzip
with gzip.open(download_location.format(year), 'rb') as f_in:
with open(download_location.format(year)[:-3], 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
# remove files
os.remove(download_location.format(year))
errcode = r.status_code
elif r.status_code == 404:
print("No data found, error 404, year {}".format(year))
errcode = 200
elif r.status_code == 403:
print("Permission denied for {}".format(year))
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2 # increase waiting time
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url, n_tries, max_time_per_dl, errcode))
time.sleep(1)
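# Hedged sketch (not from the diff): the retry/backoff idiom used for the yearly
# downloads above, factored into a helper. `fetch_with_retries` is a hypothetical
# name; the quadratic sleep (n_tries ** 2), the doubling timeout and the
# "treat 403/404 as nothing to retry" behaviour mirror the loop above.
import time
import requests

def fetch_with_retries(url, n_max_tries, max_time_per_dl):
    n_tries, errcode = 0, 999
    while (n_tries < n_max_tries) and (errcode != 200):
        r = requests.get(url, timeout=max_time_per_dl)
        if r.status_code == 200:
            return r.content  # caller writes/unzips the payload
        elif r.status_code in (403, 404):
            print('No data at {} ({})'.format(url, r.status_code))
            return None  # nothing to retry
        else:
            print('Response error {}, attempt {}'.format(r.status_code, n_tries))
            errcode = r.status_code
            n_tries += 1
            max_time_per_dl = max_time_per_dl * 2  # allow the server more time
            time.sleep(n_tries ** 2)  # back off a little more each try
    print('Giving up on {} after {} tries'.format(url, n_tries))
    return None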
def download_metadata(n_max_tries, max_time_per_dl):
url_metadata = 'http://www.aire.cdmx.gob.mx/opendata/catalogos/cat_estacion.csv'
download_location = "/esarchive/obs/ghost/MEXICO_CDMX/metadata/network_provided/MEXICO_CDMX_META_{}.csv"
n_tries = 0
errcode = 999
today = date.today()
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url_metadata, timeout=max_time_per_dl)
if r.status_code == 200:
with open(download_location.format('_unformatted'), 'wb') as outfile:
outfile.write(r.content)
print('Downloaded metadata')
errcode = r.status_code
elif r.status_code == 404:
print("No metadata found, error 404")
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2 # increase waiting time
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
time.sleep(1)
# import it as pandas to clean header
meta_file = pd.read_csv(download_location.format('_unformatted'), header=[1], encoding='ISO-8859-1')
meta_file.to_csv(download_location.format(today.strftime('%Y%m%d')), index=False)
os.remove(download_location.format('_unformatted'))
# create json from original metadata file
"""json_metadata = {}
with open('/esarchive/obs/ghost/MEXICO_CDMX/metadata/network_provided/MEXICO_CDMX_META.csv', 'r', encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['cve_estac']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/MEXICO_CDMX/metadata/processed/MEXICO_CDMX_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
"""
# create json in desired shape from current metadata file
json_metadata_now = {}
with open(download_location.format(today.strftime('%Y%m%d')), encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['cve_estac']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata_now[key] = row
# read standardised file to compare!
with open('/esarchive/obs/ghost/MEXICO_CDMX/metadata/processed/MEXICO_CDMX_META.json', 'r', encoding='ISO-8859-1') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if parameter in json_metadata_now[station].keys(): # check if column of csv exists in new file
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('{} not in new metadata file'.format(parameter))
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
for parameter in json_metadata_now[station]: # loop through all the parameters
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# is there a new parameter that wasn't in the old file?
if parameter in json_metadata[station].keys():
pass # parameter (column) is already there
else:
print('{} is new'.format(parameter))
json_metadata[station].update({parameter: json_metadata_now[station][parameter]})
# save
with open('/esarchive/obs/ghost/MEXICO_CDMX/metadata/processed/MEXICO_CDMX_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
\ No newline at end of file
@@ -22,22 +22,18 @@ from selenium.webdriver.support import expected_conditions as EC
def download_data(mode, version, n_max_tries, max_time_per_dl):
baseurl = 'https://www.miteco.gob.es/es/calidad-y-evaluacion-ambiental/temas/atmosfera-y-calidad-del-aire/calidad-del-aire/evaluacion-datos/datos/datos-2001-2021.html'
if mode == 'all':
bdate = date(2001, 1, 1) #date(1960, 1, 1) # date before record starts
edate = date.today()
os.makedirs('/esarchive/obs/ghost/MITECO/original_files/{}/'.format(version), exist_ok=True)
download_location = '/esarchive/obs/ghost/MITECO/original_files/{}/'.format(version)
elif mode == 'nrt':
print("nrt not available")
download_location = '/esarchive/obs/ghost/MITECO/original_files/nrt/'
else:
@@ -70,23 +66,40 @@ def scraper(mode, version):
for zip_link in zip_links:
filename = zip_link.get("href").rpartition('/')[-1]
url = 'https://www.miteco.gob.es/{}'.format(zip_link.get("href"))
n_tries = 0
errcode = 999
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url, timeout=max_time_per_dl)
if r.status_code == 200:
urllib.request.urlretrieve(url, download_location+filename)
print('Downloaded {}'.format(filename))
# unzip
with zipfile.ZipFile(download_location+filename, 'r') as zip_ref:
zip_ref.extractall(download_location)
os.remove(download_location+filename)
errcode = r.status_code
elif r.status_code == 404:
print("No data found, error 404")
errcode = 200
elif r.status_code == 403:
print("Permission denied for {}".format(url))
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url, n_tries, max_time_per_dl, errcode))
time.sleep(1)
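# Hedged sketch (not from the diff): the download-and-extract step used for the
# MITECO zip links above, as a helper. `download_and_extract_zip` is a hypothetical
# name; the urlretrieve + ZipFile.extractall + cleanup sequence mirrors the code
# above and assumes download_location ends with a trailing slash.
import os
import urllib.request
import zipfile

def download_and_extract_zip(url, download_location, filename):
    """Download a zip archive, unpack it next to itself and delete the archive."""
    archive_path = download_location + filename
    urllib.request.urlretrieve(url, archive_path)
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(download_location)
    os.remove(archive_path)
    print('Downloaded and extracted {}'.format(filename))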
# go to hyperlinks
@@ -119,7 +132,11 @@ def scraper(mode, version):
continue
n_tries = 0
errcode = 999
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url, timeout=max_time_per_dl)
if r.status_code == 200:
urllib.request.urlretrieve(url, download_location+filename)
print('Downloaded {}'.format(filename))
@@ -129,10 +146,26 @@ def scraper(mode, version):
zip_ref.extractall(download_location)
os.remove(download_location+filename)
errcode = r.status_code
elif r.status_code == 404:
print("No data found, error 404")
errcode = 200
elif r.status_code == 403:
print("Permission denied for {}".format(url))
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url, n_tries, max_time_per_dl, errcode))
time.sleep(1)
# delete metadata
@@ -152,3 +185,96 @@ def scraper(mode, version):
driver.close()
def download_metadata(n_max_tries, max_time_per_dl):
url_metadata = 'https://www.miteco.gob.es/content/dam/miteco/es/calidad-y-evaluacion-ambiental/sgalsi/atm%C3%B3sfera-y-calidad-del-aire/evaluaci%C3%B3n-2022/Metainformacion2022.xlsx'
download_location = "/esarchive/obs/ghost/MITECO/metadata/network_provided/MITECO_META_{}.xlsx"
Headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
r = requests.get(url_metadata, headers=Headers, timeout=max_time_per_dl)
n_tries = 0
errcode = 999
today = date.today()
while (n_tries < n_max_tries) and (errcode != 200):
if r.status_code == 200:
with open(download_location.format(today.strftime('%Y%m%d')), 'wb') as outfile:
outfile.write(r.content)
print('Downloaded metadata')
errcode = r.status_code
elif r.status_code == 404:
print("No metadata found, error 404")
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
time.sleep(1)
# convert to csv
file = pd.read_excel(download_location.format(today.strftime('%Y%m%d')))
file.to_csv('/esarchive/obs/ghost/MITECO/metadata/network_provided/MITECO_META_{}.csv'.format(today.strftime('%Y%m%d')), index=False, header=True)
"""# create json from original metadata file
json_metadata = {}
with open('/esarchive/obs/ghost/MITECO/metadata/network_provided/MITECO_META.csv', 'r', encoding='ISO-8859-1') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['SiteName_NomDuSite']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/MITECO/metadata/processed/MITECO_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
# create json in desired shape from current metadata file
json_metadata_now = {}
with open(download_location.format(today.strftime('%Y%m%d')), encoding='ISO-8859-1') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['SiteName_NomDuSite']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata_now[key] = row
# read standardised file to compare!
with open('/esarchive/obs/ghost/MITECO/metadata/processed/MITECO_META.json', 'r', encoding='ISO-8859-1') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# save
with open('/esarchive/obs/ghost/MITECO/metadata/processed/MITECO_META.json', 'w', encoding='ISO-8859-1') as f:
f.write(json.dumps(json_metadata, indent=4))"""
@@ -7,20 +7,21 @@ import re
import os
from datetime import date
from datetime import timedelta
import requests
import csv
import json
import time
def download_data(mode, version, n_max_tries, max_time_per_dl):
if mode == 'all':
start_year = 1971
end_year = 2024
elif mode == 'nrt':
start_year = date.today().strftime('%Y')
end_year = (date.today() + timedelta(days=365)).strftime('%Y')
version = mode
else:
@@ -36,7 +37,7 @@ def scraper(mode, version):
read_url = False
while read_url == False:
try:
link_data = re.findall("href=[\"\'](.*?)[\"\']", urlopen(link_url, timeout=max_time_per_dl, cafile=certifi.where()).read().decode('utf-8-sig'))
read_url = True
except HTTPError as error:
print('Data not retrieved because %s\nURL: %s'%(error, link_url))
@@ -57,11 +58,10 @@ def scraper(mode, version):
# handles issue of server hanging for 3 minutes sporadically
# try downloading each link a certain number of times before giving up
for link in link_list:
n_tries = 0
errcode = 999
while (n_tries < n_max_tries) & (errcode != 0):
if n_tries == 0:
print('Checking/Downloading %s'%(link))
else:
@@ -92,3 +92,102 @@ def scraper(mode, version):
process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf8', shell=True)
status = process.communicate()[0]
errcode = process.returncode
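# Hedged sketch (not from the diff): the subprocess-based retry used above, with a
# placeholder command. `run_until_ok` and the echo command are hypothetical; the
# Popen/communicate/returncode handling mirrors the loop above (returncode 0 means
# the shell command succeeded).
import subprocess

def run_until_ok(cmd, n_max_tries):
    n_tries, errcode, status = 0, 999, ''
    while (n_tries < n_max_tries) and (errcode != 0):
        process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                   encoding='utf8', shell=True)
        status = process.communicate()[0]  # wait for the command and capture stdout
        errcode = process.returncode
        n_tries += 1
    return errcode, status

errcode, status = run_until_ok("echo 'hello'", n_max_tries=3)
print(errcode, status)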
def download_metadata(n_max_tries, max_time_per_dl):
url_metadata = 'https://www.ncei.noaa.gov/pub/data/noaa/isd-history.csv'
download_location = "/esarchive/obs/ghost/NOAA_ISD/metadata/network_provided/NOAA_ISD_META_{}.csv"
n_tries = 0
errcode = 999
today = date.today()
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url_metadata, timeout=max_time_per_dl)
if r.status_code == 200:
with open(download_location.format(today.strftime('%Y%m%d')), 'wb') as outfile:
outfile.write(r.content)
print('Downloaded metadata')
errcode = r.status_code
elif r.status_code == 404:
print("No metadata found, error 404")
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2 # increase waiting time
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
time.sleep(1)
"""
# create json from original metadata file
json_metadata = {}
with open('/esarchive/obs/ghost/NOAA_ISD/metadata/network_provided/NOAA_ISD_META.csv', 'r', encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['USAF']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/NOAA_ISD/metadata/processed/NOAA_ISD_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
"""
# create json in desired shape from current metadata file
json_metadata_now = {}
with open(download_location.format(today.strftime('%Y%m%d')), encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['USAF']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata_now[key] = row
# read standardised file to compare!
with open('/esarchive/obs/ghost/NOAA_ISD/metadata/processed/NOAA_ISD_META.json', 'r', encoding='utf-8') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if parameter in json_metadata_now[station].keys(): # check if column of csv exists in new file
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('{} not in new metadata file'.format(parameter))
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
for parameter in json_metadata_now[station]: # loop through all the parameters
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# is there a new parameter that wasn't in the old file?
if parameter in json_metadata[station].keys():
pass # parameter (column) is already there
else:
print('{} is new'.format(parameter))
json_metadata[station].update({parameter: json_metadata_now[station][parameter]})
# save
with open('/esarchive/obs/ghost/NOAA_ISD/metadata/processed/NOAA_ISD_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
\ No newline at end of file
@@ -47,15 +47,14 @@ def download_data(mode, version, n_max_tries, max_time_per_dl):
options.add_argument("--no-sandbox")
#options.add_argument("--headless")
n_tries = 0
errcode = 999
while (n_tries < n_max_tries) and (errcode != 200):
try:
svc = webdriver.ChromeService(executable_path=binary_path)
driver = webdriver.Chrome(service=svc, options=options)
# open url
driver.get(baseurl)
WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.ID, 'sites-list'))) # wait till loaded
@@ -110,9 +109,6 @@ def download_data(mode, version, n_max_tries, max_time_per_dl):
if n_tries == n_max_tries:
print('Failed downloading US_NADP_AMNet data {} times in {} seconds'.format(n_tries, max_time_per_dl))
os.rename("{}AMNET-ALL-h.csv".format(download_location), "/esarchive/obs/ghost/US_NADP_AMNet/original_files/{}/AMNET-ALL-h.csv".format(version))
@@ -229,7 +225,7 @@ def download_metadata(n_max_tries, max_time_per_dl):
for parameter in json_metadata[station]:
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
...
@@ -8,9 +8,9 @@ import pandas
import os.path
import urllib
import time
import zipfile
import json
import csv
from chromedriver_py import binary_path
from selenium.webdriver.chrome.options import Options
@@ -22,22 +22,16 @@ from selenium.webdriver.support import expected_conditions as EC
def download_data(mode, version, n_max_tries, max_time_per_dl):
baseurl = 'https://nadp.slh.wisc.edu/networks/ammonia-monitoring-network/'
if mode == 'all':
os.makedirs('/esarchive/obs/ghost/US_NADP_AMoN/original_files/{}/temp/'.format(version), exist_ok=True)
download_location = '/esarchive/obs/ghost/US_NADP_AMoN/original_files/{}/temp/'.format(version)
elif mode == 'nrt':
download_location = '/esarchive/obs/ghost/US_NADP_AMoN/original_files/nrt/temp/'
else:
print('time mode inapplicable')
@@ -52,29 +46,30 @@ def scraper(mode, version):
svc = webdriver.ChromeService(executable_path=binary_path)
driver = webdriver.Chrome(service=svc, options=options)
n_tries = 0
errcode = 999
while (n_tries < n_max_tries) and (errcode != 200):
try:
# open url
driver.get(baseurl)
WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.ID, 'sites-list'))) # wait till loaded
dropdown_element = driver.find_element(By.ID, 'data-type')
select = Select(dropdown_element)
options = [opt.get_attribute("text") for opt in select.options]
select.select_by_visible_text("Bi-weekly")
time.sleep(max_time_per_dl)
dropdown_element = driver.find_element(By.ID, 'sites-list')
select = Select(dropdown_element)
options = [opt.get_attribute("text") for opt in select.options]
select.select_by_visible_text("All Sites")
time.sleep(max_time_per_dl)
invalid_box = driver.find_element(By.ID, 'invalid')
invalid_box.click()
time.sleep(max_time_per_dl)
# download
driver.find_element(By.ID, 'generate-button-text').click()
@@ -85,5 +80,164 @@ def scraper(mode, version):
if os.path.isfile("{}AMoN-ALL-W-i.csv".format(download_location)):
print('AMoN-ALL-W-i.csv download successful')
errcode = 200
continue
except TimeoutException as e:
print(e)
max_time_per_dl = max_time_per_dl*2 # double the waiting time for the next attempt
n_tries = n_tries+1
print("Number of tries: {}".format(n_tries))
continue
except WebDriverException as e:
print(e)
n_tries = n_tries+1
print("Number of tries: {}".format(n_tries))
continue
except Exception as e:
    print("Unknown error: {}".format(e))
n_tries = n_tries+1
print("Number of tries: {}".format(n_tries))
continue
driver.close() driver.close()
if n_tries == n_max_tries:
print('Failed downloading US_NADP_AMoN data {} times in {} seconds'.format(n_tries, max_time_per_dl))
os.rename("{}AMoN-ALL-W-i.csv".format(download_location), "/esarchive/obs/ghost/US_NADP_AMoN/original_files/{}/AMoN-ALL-W-i.csv".format(version))
def download_metadata(n_max_tries, max_time_per_dl):
baseurl = 'https://nadp.slh.wisc.edu/networks/ammonia-monitoring-network/'
#os.makedirs('/esarchive/obs/ghost/US_NADP_AMoN/metadata/network_provided/', exist_ok=True)
download_location = '/esarchive/obs/ghost/US_NADP_AMoN/metadata/network_provided/'
today = date.today()
# set up driver
options = Options()
prefs = {'download.default_directory' : download_location}
options.add_experimental_option('prefs', prefs)
options.add_argument("--no-sandbox")
options.add_argument("--headless")
n_tries = 0
errcode = 999
svc = webdriver.ChromeService(executable_path=binary_path)
driver = webdriver.Chrome(service=svc, options=options)
driver.maximize_window()
while (n_tries < n_max_tries) and (errcode != 200):
try:
# open url
driver.get(baseurl)
#WebDriverWait(driver, max_time_per_dl).until(EC.element_to_be_clickable((By.ID, 'invalid'))) # wait till loaded
time.sleep(max_time_per_dl)
invalid_box = driver.find_element(By.ID, 'download-show-inactive')
driver.execute_script("arguments[0].click()", invalid_box)
# download
#WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.ID, 'generate-button-text'))) # wait till loaded
time.sleep(max_time_per_dl)
bttn = driver.find_element(By.ID, 'network-data-submit')
driver.execute_script("arguments[0].click()", bttn)
# wait until download finished
n_waited = 0
while not os.path.exists(download_location+'amon.csv') and n_waited < max_time_per_dl: # bounded wait so a failed download cannot hang forever
    time.sleep(1)
    n_waited += 1
if os.path.isfile(download_location+'amon.csv'):
print('Amon metadata download successful')
errcode = 200
continue
except TimeoutException as e:
print(e)
max_time_per_dl = max_time_per_dl*2 # double the waiting time for the next attempt
n_tries = n_tries+1
continue
except WebDriverException as e:
print(e)
n_tries = n_tries+1
print("Number of tries: {}".format(n_tries))
continue
except Exception as e:
    print("Unknown error: {}".format(e))
max_time_per_dl = max_time_per_dl*2 # double the waiting time for the next attempt
n_tries = n_tries+1
continue
driver.close()
if n_tries == n_max_tries:
    print('Failed downloading AMoN metadata {} times in {} seconds'.format(n_tries, max_time_per_dl))
else: # only rename the file once the download succeeded
    os.rename(download_location+'amon.csv', download_location+'US_NADP_AMoN_META_{}.csv'.format(today.strftime('%Y%m%d')))
# create json from original metadata file =====================================================================================
"""json_metadata = {}
with open('/esarchive/obs/ghost/US_NADP_AMoN/metadata/network_provided/US_NADP_AMoN_META.csv', 'r') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['siteId']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/US_NADP_AMoN/metadata/processed/US_NADP_AMoN_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
"""
# create json in desired shape from current metadata file
json_metadata_now = {}
with open(download_location+'US_NADP_AMoN_META_{}.csv'.format(today.strftime('%Y%m%d')), encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['siteId']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata_now[key] = row
# read standardised file to compare!
with open('/esarchive/obs/ghost/US_NADP_AMoN/metadata/processed/US_NADP_AMoN_META.json', 'r', encoding='utf-8') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# save
with open('/esarchive/obs/ghost/US_NADP_AMoN/metadata/processed/US_NADP_AMoN_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
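The loop above keeps an append-on-change history per metadata field: a value is appended, together with its update date, only when it differs from the last recorded one. A condensed sketch of that merge as a standalone function (hypothetical name, same nested dict layout as the JSON files above, prints omitted) could look like this:

def merge_metadata(json_metadata, json_metadata_now, update_date):
    # json_metadata / json_metadata_now: {station: {parameter: {'values': [...], 'update_time': [...]}}}
    for station, parameters in json_metadata_now.items():
        if station not in json_metadata: # new station appeared
            json_metadata[station] = parameters
            continue
        for parameter, entry in parameters.items():
            history = json_metadata[station][parameter]
            if history['values'][-1] != entry['values'][0]: # value changed since last download
                history['values'].append(entry['values'][0])
                history['update_time'].append(update_date)
    return json_metadata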
import hashlib # works for all types of data import hashlib # works for all types of data
import requests
def request_download(url, max_time_per_dl, download_location, file, year, n_max_tries): # file is a filename template such as 'data_{}.csv', formatted with year; n_max_tries caps the retries
Headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
n_tries = 0
errcode = 999
while (n_tries < n_max_tries) and (errcode != 200):
    r = requests.get(url, headers=Headers, timeout=max_time_per_dl) # issue a fresh request on every attempt
if r.status_code == 200:
open(download_location + file.format(year), "wb").write(r.content)
print('Downloaded {}'.format(file.format(year)))
errcode = r.status_code
elif r.status_code == 404:
print("No ozone l data found, error 404")
errcode = 200
elif r.status_code == 403:
print("Permission denied for {}".format(file.format(year)))
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
time.sleep(n_tries ** 2) # back off a little longer on every retry
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url, n_tries, max_time_per_dl, errcode))
time.sleep(1)
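A hedged usage sketch of request_download follows; the URL, filename template, and target directory are placeholders, not real endpoints, and requests/time are already imported above.

# hypothetical call: fetch one yearly file into a network's temp folder, retrying up to 3 times
request_download('https://example.org/archive/data_2023.csv', 30,
                 '/esarchive/obs/ghost/SOME_NETWORK/original_files/1.6/temp/',
                 file='data_{}.csv', year=2023, n_max_tries=3)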
# check if files are different # check if files are different
......
...@@ -61,7 +61,13 @@ if __name__ == "__main__": ...@@ -61,7 +61,13 @@ if __name__ == "__main__":
'CNEMC': {'max_time_dl': 3}, 'CNEMC': {'max_time_dl': 3},
'CANADA_NAPS': {'max_time_dl': 5}, 'CANADA_NAPS': {'max_time_dl': 5},
'CAPMoN': {'max_time_dl': 5}, 'CAPMoN': {'max_time_dl': 5},
'US_NADP_AMNet': {'max_time_dl': 10}} 'US_NADP_AMNet': {'max_time_dl': 10},
'US_NADP_AMoN': {'max_time_dl': 7},
'MEXICO_CDMX': {'max_time_dl': 10},
'NOAA_ISD': {'max_time_dl': 15},
'MITECO': {'max_time_dl': 10},
'EANET': {'max_time_dl': 5},
'CHILE_SINCA': {'max_time_dl': 30}}
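A short sketch of how one of these per-network limits might be looked up before dispatching a download; the dict's variable name is not visible in this hunk, so time_dls below is an assumed placeholder.

# time_dls is an assumed name for the dict above; fall back to a default if a network is missing
max_time_per_dl = time_dls.get('US_NADP_AMoN', {}).get('max_time_dl', 10)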
# download data # download data
...@@ -71,7 +77,7 @@ if __name__ == "__main__": ...@@ -71,7 +77,7 @@ if __name__ == "__main__":
dl_metadata = True dl_metadata = True
# networks you want to download # networks you want to download
networks = [US_NADP_AMNet_download] networks = [CHILE_SINCA_download]
# download all networks # download all networks
#networks = ['all'] #networks = ['all']
...@@ -107,8 +113,6 @@ if __name__ == "__main__": ...@@ -107,8 +113,6 @@ if __name__ == "__main__":
if dl_data == True: if dl_data == True:
network.download_data(mode, version, n_max_tries, max_time_per_dl) network.download_data(mode, version, n_max_tries, max_time_per_dl)
pass
if dl_metadata == True: if dl_metadata == True:
network.download_metadata(n_max_tries, max_time_per_dl) network.download_metadata(n_max_tries, max_time_per_dl)
pass