......@@ -171,20 +171,21 @@ def download_metadata(n_max_tries, max_time_per_dl):
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
time.sleep(1)
"""# create json from original metadata file
# create json from original metadata file
json_metadata = {}
with open('/esarchive/obs/ghost/CAPMoN/metadata/network_provided/CAPMoN_META.csv', 'r', encoding='ISO-8859-1') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['SiteName_NomDuSite']
key = row['ID']+'_'+row['Measurements_Mesures'].replace('"', '')[:3]
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/CAPMoN/metadata/processed/CAPMoN_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))"""
f.write(json.dumps(json_metadata, indent=4))
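For reference, the processed metadata JSON built here nests every CSV column under a values/update_time pair so later runs can append revisions. A minimal sketch of that shape, with hypothetical station and column names (not taken from a real CAPMoN file):

# hypothetical example of the processed metadata structure
json_metadata = {
    "CAPMCA0001_SO2": {  # key: row['ID'] + '_' + first three characters of the measurement field
        "ID": {"values": ["CAPMCA0001"], "update_time": ["2024-03-01"]},
        "SiteName_NomDuSite": {"values": ["Example Site"], "update_time": ["2024-03-01"]},
    }
}
# a later run that detects a change appends to both lists, keeping the history:
json_metadata["CAPMCA0001_SO2"]["SiteName_NomDuSite"]["values"].append("Example Site (renamed)")
json_metadata["CAPMCA0001_SO2"]["SiteName_NomDuSite"]["update_time"].append("2024-06-01")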
# create json in desired shape from current metadata file
......@@ -193,7 +194,7 @@ def download_metadata(n_max_tries, max_time_per_dl):
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['SiteName_NomDuSite']
key = row['ID']+'_'+row['Measurements_Mesures'].replace('"', '')[:3]
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
......@@ -218,11 +219,18 @@ def download_metadata(n_max_tries, max_time_per_dl):
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
for parameter in json_metadata_now[station]: # loop through all the parameters
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# is there a new parameter that wasn't in the old file?
if parameter in json_metadata[station].keys():
pass # parameter (column) is already there
else:
print('{} is new'.format(parameter))
json_metadata[station].update({parameter: json_metadata_now[station][parameter]})
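To make the new-station and new-parameter branches above concrete, a hypothetical before/after (station and column names are invented, same values/update_time shape as elsewhere in this file):

# hypothetical inputs for the merge loop above
json_metadata = {"STN_A": {"ID": {"values": ["STN_A"], "update_time": ["2024-01-01"]}}}
json_metadata_now = {
    "STN_A": {"ID": {"values": ["STN_A"], "update_time": ["2024-06-01"]},
              "Elevation": {"values": ["120"], "update_time": ["2024-06-01"]}},  # new column
    "STN_B": {"ID": {"values": ["STN_B"], "update_time": ["2024-06-01"]}},       # new station
}
# after the loop: "STN_B" is copied over in full and "Elevation" is added under "STN_A";
# unchanged parameters keep their existing history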
# save
......
......@@ -5,10 +5,11 @@ from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from bs4 import BeautifulSoup
import requests
import time
......@@ -23,14 +24,16 @@ import zipfile
import shutil
import os
import re
import csv
import json
def scraper(mode, version):
def download_data(mode, version, n_max_tries, max_time_per_dl):
# paths and variables
variables_website = ['MP 10','MP 2,5', 'MP 10discreto', 'SO2', 'NO2', 'NOX', 'NO', 'CO', 'O3', 'Pb', 'As', 'Cu'] # complete list later
variables_website = ['MP 10','MP 2,5', 'MP 10discreto', 'SO2', 'NO2', 'NOX', 'NO', 'CO', 'O3', 'Pb', 'As', 'Cu']
variables_text = ["Material particulado MP 10", "Material particulado MP 2,5", "Material particulado 10 micrometros discreto", 'Dióxido de azufre', 'Dióxido de nitrógeno', 'Óxidos de nitrógeno', 'Monóxido de nitrógeno', 'Monóxido de carbono', 'Ozono', 'Plomo', 'Arsénico', 'Cobre']
time_resolutions_website = ["registro diario", "registro horario"] # complete later
time_resolutions_website = ["registro diario", "registro horario"]
variables_ghost = ['PM10', 'PM2.5', 'PM10disc', 'SO2', 'NO2', 'NOX', 'NO','CO', 'O3','Pb', 'As', 'Cu'] # needs to be same order as variables_website!
time_resolution_ghost = ['daily', 'hourly']
......@@ -39,210 +42,345 @@ def scraper(mode, version):
baseurl = 'https://sinca.mma.gob.cl/index.php/'
# only for nrt
bdate = "240101"
edate = date.today().strftime('%Y%m%d')[2:]
print(edate)
# create download directory
os.makedirs('/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version), exist_ok=True)
download_location = '/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version)
# set up driver
options = Options()
prefs = {'download.default_directory' : download_location}
options.add_experimental_option('prefs', prefs)
options.add_argument("--no-sandbox")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
options.add_argument("--disable-gpu")
options.add_argument("--disable-dev-shm-usage")
svc = webdriver.ChromeService(executable_path=binary_path)
driver = webdriver.Chrome(service=svc, options=options)
# open url
driver.get(baseurl)
WebDriverWait(driver, 60).until(EC.presence_of_all_elements_located((By.XPATH, '//a[contains(@href, "/index.php/region/index/id/")]'))) # wait till loaded
# navigate to regions
html = driver.page_source
soup = BeautifulSoup(html, features="html.parser")
regions = soup.find_all("a", href=re.compile(r"^/index.php/region/index/id/"))
regions = soup.find_all("a", {"href": "/index.php/region/index/id/II"}) # Antofagasta
for region in regions:
print("Region is "+region.getText())
driver.get("https://sinca.mma.gob.cl/"+region.get("href"))
time.sleep(3)
# navigate to station and component
html_region = driver.page_source
soup_region = BeautifulSoup(html_region, features="html.parser")
a_titles = [a.get("title") for a in soup_region.find_all("a", title=True)] # all links
# from all the links, choose only the components
stations_components = []
for a_title in a_titles:
for variable_text in variables_text:
if variable_text in a_title:
stations_components.append(soup_region.find("a", {"title": a_title}))
# loop through all stations and components of the region
for station_component in stations_components:
print(station_component.get("title"))
station = station_component.get("title").split("| ", 1)[1]
component = [x for x in variables_text if x in station_component.get("title")][0] # get component name on website
component_choose_time_res = variables_website[variables_text.index(component)] # get component name for choosing time resolution
component_ghost = variables_ghost[variables_text.index(component)] # get component name accordingly in ghost
# create storage directory
try:
station_id = metadata15["station_reference"][metadata15["station_name"] == station].iloc[0]
os.makedirs('/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/{}/'.format(version, station_id), exist_ok=True)
storage_location = '/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/{}/'.format(version, station_id)
except:
print("{} added since 1.5, no station_id found in metadata_1.5, did not download data from new station".format(station))
continue
# go to data on website
driver.get('https:'+station_component.get("href"))
time.sleep(5)
if mode == 'all':
# create download directory
os.makedirs('/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version), exist_ok=True)
download_location = '/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version)
elif mode == 'nrt':
bdate = date(date.today().year, 1, 1).strftime('%Y%m%d')[2:] #"240101"
edate = date.today().strftime('%Y%m%d')[2:]
print(edate)
# create download directory
version = mode
os.makedirs('/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version), exist_ok=True)
download_location = '/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version)
else:
print('time mode inapplicable')
n_tries = 0
errcode = 999
while (n_tries < n_max_tries) and (errcode != 200):
try:
# set up driver
options = Options()
prefs = {'download.default_directory' : download_location}
options.add_experimental_option('prefs', prefs)
options.add_argument("--no-sandbox")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
options.add_argument("--disable-gpu")
options.add_argument("--disable-dev-shm-usage")
if n_tries > 0:
options.add_argument("--headless")
svc = webdriver.ChromeService(executable_path=binary_path)
driver = webdriver.Chrome(service=svc, options=options)
driver.get(baseurl)
WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.XPATH, '//a[contains(@href, "/index.php/region/index/id/")]'))) # wait till loaded
# navigate to regions
html = driver.page_source
soup = BeautifulSoup(html, features="html.parser")
regions = soup.find_all("a", href=re.compile(r"^/index.php/region/index/id/"))
regions = soup.find_all("a", {"href": "/index.php/region/index/id/II"}) # Antofagasta
driver.switch_to.frame("left")
# select time resolution
dropdown_element = driver.find_element(By.ID, 'ic')
select = Select(dropdown_element)
options = [opt.get_attribute("text") for opt in select.options]
i=0
for time_resolution in time_resolutions_website:
#select time resolution if existent!
if (component_choose_time_res+' - '+time_resolution) in options:
select.select_by_visible_text(component_choose_time_res+' - '+time_resolution)
#print("Time resolution is: {}".format(time_resolution_ghost[i]))
for region in regions:
print("Region is "+region.getText())
driver.get("https://sinca.mma.gob.cl/"+region.get("href"))
time.sleep(3)
# navigate to station and component
html_region = driver.page_source
soup_region = BeautifulSoup(html_region, features="html.parser")
a_titles = [a.get("title") for a in soup_region.find_all("a", title=True)] # all links
# from all the links, choose only the components
stations_components = []
for a_title in a_titles:
for variable_text in variables_text:
if variable_text in a_title:
stations_components.append(soup_region.find("a", {"title": a_title}))
# loop through all stations and components of the region
for station_component in stations_components:
print(station_component.get("title"))
station = station_component.get("title").split("| ", 1)[1]
component = [x for x in variables_text if x in station_component.get("title")][0] # get component name on website
component_choose_time_res = variables_website[variables_text.index(component)] # get component name for choosing time resolution
component_ghost = variables_ghost[variables_text.index(component)] # get component name accordingly in ghost
# create storage directory
try:
station_id = metadata15["station_reference"][metadata15["station_name"] == station].iloc[0]
os.makedirs('/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/{}/'.format(version, station_id), exist_ok=True)
storage_location = '/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/{}/'.format(version, station_id)
except:
print("{} added since 1.5, no station_id found in metadata_1.5, did not download data from new station".format(station))
continue
# go to data on website
driver.get('https:'+station_component.get("href"))
time.sleep(5)
driver.switch_to.frame("left")
if mode == "all":
start_date = driver.find_element(By.ID, "from").get_attribute("value")
end_date = driver.find_element(By.ID, "to").get_attribute("value")
if mode == "nrt": # updating dates difficult
start_date = driver.find_element(By.ID, "from")
driver.execute_script("arguments[0].value = {};".format(bdate), start_date)
end_date = driver.find_element(By.ID, "to")
driver.execute_script("arguments[0].value = {};".format(edate), end_date)
time.sleep(10)
# select time resolution
dropdown_element = driver.find_element(By.ID, 'ic')
select = Select(dropdown_element)
options = [opt.get_attribute("text") for opt in select.options]
i=0
for time_resolution in time_resolutions_website:
#select time resolution if existent!
if (component_choose_time_res+' - '+time_resolution) in options:
select.select_by_visible_text(component_choose_time_res+' - '+time_resolution)
#print("Time resolution is: {}".format(time_resolution_ghost[i]))
time.sleep(5)
driver.switch_to.default_content()
driver.switch_to.frame("right")
if mode == "all":
start_date = driver.find_element(By.ID, "from").get_attribute("value")
end_date = driver.find_element(By.ID, "to").get_attribute("value")
if mode == "nrt": # updating dates difficult
start_date = driver.find_element(By.ID, "from")
driver.execute_script("arguments[0].value = {};".format(bdate), start_date)
end_date = driver.find_element(By.ID, "to")
driver.execute_script("arguments[0].value = {};".format(edate), end_date)
time.sleep(10)
WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.LINK_TEXT, "Excel CSV"))) # wait till loaded
time.sleep(10)
driver.find_element(By.LINK_TEXT, "Excel CSV").click()
driver.switch_to.default_content()
driver.switch_to.frame("right")
# wait until download finished
while not os.path.exists("{}/datos_{}_{}.csv".format(download_location, start_date, end_date)):
time.sleep(1)
time.sleep(10)
WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.LINK_TEXT, "Excel CSV"))) # wait till loaded
if os.path.isfile("{}/datos_{}_{}.csv".format(download_location, start_date, end_date)):
print('{} {} download successful'.format(station_component.get("title"), time_resolution_ghost[i]))
shutil.copyfile("{}/datos_{}_{}.csv".format(download_location, start_date, end_date), storage_location+component_ghost+'_'+time_resolution_ghost[i]+'.csv')
driver.find_element(By.LINK_TEXT, "Excel CSV").click()
os.remove("{}/datos_{}_{}.csv".format(download_location, start_date, end_date))
# wait until download finished
while not os.path.exists("{}/datos_{}_{}.csv".format(download_location, start_date, end_date)):
time.sleep(1)
driver.switch_to.default_content()
driver.switch_to.frame("left")
i=i+1
if os.path.isfile("{}/datos_{}_{}.csv".format(download_location, start_date, end_date)):
print('{} {} download successful'.format(station_component.get("title"), time_resolution_ghost[i]))
shutil.copyfile("{}/datos_{}_{}.csv".format(download_location, start_date, end_date), storage_location+component_ghost+'_'+time_resolution_ghost[i]+'.csv')
os.remove("{}/datos_{}_{}.csv".format(download_location, start_date, end_date))
driver.close()
driver.switch_to.default_content()
driver.switch_to.frame("left")
i=i+1
driver.close()
errcode = 200
except WebDriverException as e:
print(e)
n_tries = n_tries+1
print("Number of tries: {}".format(n_tries))
continue
def scraper_metadata(mode, version):
if n_tries == n_max_tries:
print('Failed downloading CHILE_SINCA data {} times in {} seconds'.format(n_tries, max_time_per_dl))
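The try/except WebDriverException loop above is the retry pattern reused for every Selenium-driven network in this PR. A condensed sketch of that pattern as a standalone helper, purely illustrative (the helper name and the scrape_once callable are not part of the proposed code):

from selenium.common.exceptions import WebDriverException

def run_with_retries(scrape_once, n_max_tries):
    # call scrape_once() until it succeeds or the attempt budget is used up
    n_tries = 0
    errcode = 999
    while (n_tries < n_max_tries) and (errcode != 200):
        try:
            scrape_once()   # expected to open the driver, download, and close the driver
            errcode = 200
        except WebDriverException as e:
            print(e)
            n_tries = n_tries + 1
            print("Number of tries: {}".format(n_tries))
    if n_tries == n_max_tries:
        print('Failed after {} tries'.format(n_tries))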
baseurl = 'https://sinca.mma.gob.cl/index.php/'
metadata15 = pd.read_csv("/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/temp/CHILE_SINCA_META_temp.txt")
print(metadata15)
# set up driver
options = Options()
#prefs = {'download.default_directory' : download_location}
#options.add_experimental_option('prefs', prefs)
options.add_argument("--no-sandbox")
svc = webdriver.ChromeService(executable_path=binary_path)
driver = webdriver.Chrome(service=svc, options=options)
# open url
driver.get(baseurl)
WebDriverWait(driver, 60).until(EC.presence_of_all_elements_located((By.XPATH, '//a[contains(@href, "/index.php/region/index/id/")]'))) # wait till loaded
# navigate to regions
html = driver.page_source
soup = BeautifulSoup(html, features="html.parser")
regions = soup.find_all("a", href=re.compile(r"^/index.php/region/index/id/"))
regions = soup.find_all("a", {"href": "/index.php/region/index/id/II"}) # Antofagasta
for region in regions:
print("Region is "+region.getText())
def download_metadata(n_max_tries, max_time_per_dl):
driver.get("https://sinca.mma.gob.cl/"+region.get("href"))
time.sleep(3)
# paths and variables
variables_website = ['MP 10','MP 2,5', 'MP 10discreto', 'SO2', 'NO2', 'NOX', 'NO', 'CO', 'O3', 'Pb', 'As', 'Cu']
variables_text = ["Material particulado MP 10", "Material particulado MP 2,5", "Material particulado 10 micrometros discreto", 'Dióxido de azufre', 'Dióxido de nitrógeno', 'Óxidos de nitrógeno', 'Monóxido de nitrógeno', 'Monóxido de carbono', 'Ozono', 'Plomo', 'Arsénico', 'Cobre']
time_resolutions_website = ["registro diario", "registro horario"]
html = driver.page_source
soup = BeautifulSoup(html, features="html.parser")
stations = soup.find_all("a", href=re.compile(r"^/index.php/estacion/index/id/"))
variables_ghost = ['PM10', 'PM2.5', 'PM10disc', 'SO2', 'NO2', 'NOX', 'NO','CO', 'O3','Pb', 'As', 'Cu'] # needs to be same order as variables_website!
time_resolution_ghost = ['daily', 'hourly']
for station in stations:
baseurl = 'https://sinca.mma.gob.cl/index.php/'
today = date.today()
station_name = station.getText()
print(station_name)
driver.get("https://sinca.mma.gob.cl/"+station.get("href"))
time.sleep(3)
# get meta info
metadata15 = pd.read_csv("/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/temp/CHILE_SINCA_META_temp.txt")
#print(metadata15)
instruments_old = ['O3_instrument','NO_instrument','NO2_instrument','CO_instrument','CH4_instrument','SO2_instrument','NMHC_instrument','HC_instrument','PM10_instrument','PM2.5_instrument','As_instrument','Cu_instrument','Pb_instrument']
"""
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/processed/CHILE_SINCA_META.json', 'r', encoding='utf-8') as f:
metadata_old = json.loads(f.read())
n_tries = 0
errcode = 999
metadata_new = {}
while (n_tries < n_max_tries) and (errcode != 200):
try:
# set up driver
options = Options()
#prefs = {'download.default_directory' : download_location}
#options.add_experimental_option('prefs', prefs)
options.add_argument("--no-sandbox")
options.add_argument("--headless")
svc = webdriver.ChromeService(executable_path=binary_path)
driver = webdriver.Chrome(service=svc, options=options)
# open url
driver.get(baseurl)
WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.XPATH, '//a[contains(@href, "/index.php/region/index/id/")]'))) # wait till loaded
# navigate to regions
html = driver.page_source
soup = BeautifulSoup(html, features="html.parser")
region = soup.find("th", text="Región").find_next_sibling().getText()
province = soup.find("th", text="Provincia").find_next_sibling().getText()
commune = soup.find("th", text="Comuna").find_next_sibling().getText()
UTM_coordinates = soup.find("th", text="Coordenadas UTM").find_next_sibling().getText()
timezone = soup.find("th", text="Huso horario").find_next_sibling().getText()
regions = soup.find_all("a", href=re.compile(r"^/index.php/region/index/id/"))
#regions = soup.find_all("a", {"href": "/index.php/region/index/id/II"}) # Antofagasta
scraped_metadata = [station_reference, station_name, region, province, commune, UTM_coordinates, timezone]
for region in regions:
metadata15_per_station = metadata15.loc[metadata15["station_name"] == station_name]
print(region)
print(metadata15_per_station)
print(metadata15_per_station["region"].iloc[0])
print("Region is "+region.getText())
i=0
for column in metadata15_per_station.head():
print(column)
driver.get("https://sinca.mma.gob.cl/"+region.get("href"))
time.sleep(1)
if metadata15_per_station[column].iloc[0] == scraped_metadata[i]:
print("ok!")
else:
print("not ok")
html = driver.page_source
soup = BeautifulSoup(html, features="html.parser")
stations = soup.find_all("a", href=re.compile(r"^/index.php/estacion/index/id/"))
i=i+1
for station in stations:
station_name_new = station.getText()
print(station_name_new)
driver.get("https://sinca.mma.gob.cl/"+station.get("href"))
time.sleep(3)
# get meta info
html = driver.page_source
soup = BeautifulSoup(html, features="html.parser")
region = soup.find("th", text="Región").find_next_sibling().getText()
province = soup.find("th", text="Provincia").find_next_sibling().getText()
commune = soup.find("th", text="Comuna").find_next_sibling().getText()
UTM_coordinates = soup.find("th", text="Coordenadas UTM").find_next_sibling().getText().replace(' ', '')
lon = UTM_coordinates.split('E')[0]+'E'
lat = UTM_coordinates.split('E')[1].split("\n")[0]
timezone = soup.find("th", text="Huso horario").find_next_sibling().getText().replace(' ', '')
ins_table = soup.find('table', id="medicion")
if ins_table is not None: # check if there are instruments for air pollution at this station
instruments = ins_table.find_all("td", {"class": "helpTecnica center"})
instruments_per_component = {}
else:
continue
for instrument in instruments:
component = instrument.find_parent().find('a').getText()
try: # rename
component = variables_ghost[variables_text.index(component)]
except:
try:
component = variables_ghost[variables_website.index(component)]
except:
pass
if 'Ozono.-' in component:
component = 'O3'
#======
if "No informado" in instrument.getText():
instruments_per_component[component] = ''
else:
instrument_name = re.sub(' +', ' ', instrument.getText())
instrument_name = instrument_name.split("\n")[-1]
instruments_per_component[component] = instrument_name
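A quick illustration of the instrument-name cleanup above; the cell text is hypothetical, real SINCA table cells may differ:

import re

cell_text = "Monitor:\nFotometría   UV"   # invented <td class="helpTecnica center"> content
name = re.sub(' +', ' ', cell_text)       # collapse repeated spaces -> "Monitor:\nFotometría UV"
name = name.split("\n")[-1]               # keep only the last line  -> "Fotometría UV"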
for station_reference in metadata_old:
if metadata_old[station_reference]['station_name']['values'][0] == station_name_new: # match station with previously referenced station reference from old file
i=0
metadata_new[station_reference] = {} # create inner dictionary
scraped_metadata = [station_reference, station_name_new, region, province, commune, lon, lat, timezone]
for parameter in metadata_old[station_reference]: # loop through the meta parameters
if ("instrument" not in parameter) and ("comments" not in parameter): # go through all that are not instruments
metadata_new[station_reference][parameter] = {"values": [scraped_metadata[i]], "update_time": [today.strftime('%Y-%m-%d')]}
i=i+1
elif "comments" == parameter:
metadata_new[station_reference][parameter] = {"values": [''], "update_time": [today.strftime('%Y-%m-%d')]}
else: # go through the instruments
for component in instruments_per_component:
if component in parameter:
metadata_new[station_reference][parameter] = {"values": [instruments_per_component[component]], "update_time": [today.strftime('%Y-%m-%d')]}
else:
metadata_new[station_reference][parameter] = {"values": [''], "update_time": [today.strftime('%Y-%m-%d')]}
# save
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/CHILE_SINCA_META_{}.json'.format(today.strftime('%Y%m%d')), 'w', encoding='utf-8') as f:
f.write(json.dumps(metadata_new, indent=4, ensure_ascii=False))
driver.close()
errcode = 200
except WebDriverException as e:
print(e)
n_tries = n_tries+1
print("Number of tries: {}".format(n_tries))
continue
if n_tries == n_max_tries:
print('Failed downloading CHILE_SINCA metadata {} times in {} seconds'.format(n_tries, max_time_per_dl))"""
driver.close()
"""
# create json from original metadata file =====================================================================================
json_metadata = {}
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/CHILE_SINCA_META.txt', 'r') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['station_reference']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/processed/CHILE_SINCA_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
"""
# read newly scraped file
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/CHILE_SINCA_META_{}.json'.format(today.strftime('%Y%m%d')), 'r', encoding='utf-8') as f:
json_metadata_now = json.loads(f.read())
# read standardised file to compare!
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/processed/CHILE_SINCA_META.json', 'r', encoding='utf-8') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# save
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/processed/CHILE_SINCA_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
\ No newline at end of file
......@@ -15,14 +15,30 @@ import zipfile
import os.path
import os
import pandas as pd
def scraper(mode, version):
def download_data(mode, version, n_max_tries, max_time_per_dl):
url = 'https://monitoring.eanet.asia/document/menu/index#publicData'
download_location = "/esarchive/obs/ghost/EANET/original_files/{}/".format(version)
today = date.today().strftime('%Y%m%d') #+ timedelta(days = 1)).strftime('%Y%m%d') # problem with timezones???
#print(today)
if mode == 'all':
bdate = date(1980, 1, 1) #date(1960, 1, 1) # date before record starts
edate = date.today()
os.makedirs('/esarchive/obs/ghost/EANET/original_files/{}/'.format(version), exist_ok=True)
download_location = "/esarchive/obs/ghost/EANET/original_files/{}/".format(version)
elif mode == 'nrt':
print("EANET no nrt")
exit()
else:
print('time mode inapplicable')
exit()
options = Options()
prefs = {'download.default_directory' : download_location}
......@@ -32,17 +48,17 @@ def scraper(mode, version):
driver = webdriver.Chrome(service=svc, options=options)
driver.get(url)
time.sleep(2)
time.sleep(max_time_per_dl)
# login
email = driver.find_element(By.ID, "email")
email.send_keys("raphael.grodofzig@bsc.es")
passwd = driver.find_element(By.ID, "passwd")
passwd.send_keys("274s9QZ5")
time.sleep(2)
time.sleep(max_time_per_dl)
driver.find_element(By.NAME, "submitBtn").click()
time.sleep(3)
time.sleep(max_time_per_dl)
# find countries
dropdown_element = driver.find_element(By.ID, 'countryCd')
......@@ -104,4 +120,109 @@ def scraper(mode, version):
i=i+1
driver.close()
\ No newline at end of file
driver.close()
def download_metadata(n_max_tries, max_time_per_dl):
url_metadata = 'https://www.eanet.asia/wp-content/uploads/2024/01/Site_Information_Acid_Deposition_Monitoring_NMP2023_1117.xlsm'
download_location = "/esarchive/obs/ghost/EANET/metadata/network_provided/EANET_META_{}.csv"
Headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
n_tries = 0
errcode = 999
today = date.today()
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url_metadata, timeout=max_time_per_dl, headers=Headers)
if r.status_code == 200:
with open(download_location.format(today.strftime('%Y%m%d')), 'wb') as outfile:
outfile.write(r.content)
print('Downloaded metadata')
errcode = r.status_code
elif r.status_code == 404:
print("No metadata found, error 404")
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2 # increase waiting time
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
time.sleep(1)
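The same requests-based retry appears for every HTTP-downloaded network in this PR (EANET, MEXICO_CDMX, MITECO, NOAA_ISD). A minimal sketch of the behaviour it implements (quadratic back-off sleep, doubled timeout, 404 treated as "nothing to download"); the helper name is illustrative and not part of the proposed code:

import time
import requests

def fetch_with_retries(url, n_max_tries, max_time_per_dl, headers=None):
    # condensed version of the retry loops in this PR, for illustration only
    n_tries = 0
    errcode = 999
    content = None
    while (n_tries < n_max_tries) and (errcode != 200):
        r = requests.get(url, timeout=max_time_per_dl, headers=headers)
        if r.status_code == 200:
            content = r.content
            errcode = 200
        elif r.status_code == 404:
            errcode = 200                           # nothing to download, stop retrying
        else:
            n_tries += 1
            max_time_per_dl = max_time_per_dl * 2   # give the next attempt more time
            time.sleep(n_tries ** 2)                # wait a little longer each attempt
    return content, errcode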
metadata = pd.read_excel(download_location.format(today.strftime('%Y%m%d')), engine='pyxlsb').fillna('')
print(metadata)
"""
# create json from original metadata file
json_metadata = {}
with open('/esarchive/obs/ghost/EANET/metadata/network_provided/EANET_META.csv', 'r', encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['cve_estac']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/EANET/metadata/processed/EANET_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
"""
"""
# create json in desired shape from current metadata file
json_metadata_now = {}
with open(download_location.format(today.strftime('%Y%m%d')), encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['cve_estac']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata_now[key] = row
# read standardised file to compare!
with open('/esarchive/obs/ghost/EANET/metadata/processed/EANET_META.json', 'r', encoding='ISO-8859-1') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if parameter in json_metadata_now[station].keys(): # check if column of csv exists in new file
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('{} not in new metadata file'.format(parameter))
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
for parameter in json_metadata_now[station]: # loop through all the parameters
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# is there a new parameter that wasn't in the old file?
if parameter in json_metadata[station].keys():
pass # parameter (column) is already there
else:
print('{} is new'.format(parameter))
json_metadata[station].update({parameter: json_metadata_now[station][parameter]})
# save
with open('/esarchive/obs/ghost/EANET/metadata/processed/EANET_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))"""
\ No newline at end of file
import requests
import time
from datetime import date
from datetime import timedelta
import zipfile
import urllib
import os.path
import os
import pandas as pd
def scraper(mode):
url = 'https://monitoring.eanet.asia/document/menu/index#publicData'
download_url = 'https://www.eanet.asia/wp-content/uploads/2024/01/Site_Information_Acid_Deposition_Monitoring_NMP2023_1117.xlsm'
download_location = "/esarchive/obs/ghost/EANET/metadata/network_provided/"
today = date.today().strftime('%Y%m%d') #+ timedelta(days = 1)).strftime('%Y%m%d') # problem with timezones???
"""
Headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
r = requests.get(download_url, timeout=120, headers=Headers)
print(r.status_code)
if r.status_code == 200:
urllib.request.urlretrieve(url, download_location+"downloaded_metadata.xlsm")
print('Downloaded metadata')
else:
print('url status not ok')"""
# open file
metadata = pd.read_excel(download_location+"downloaded_metadata.xlsm", engine='pyxlsb').fillna('')
print(metadata)
......@@ -8,20 +8,20 @@ import urllib
import tarfile
import shutil
import gzip
import csv
import json
def scraper(mode, version):
def download_data(mode, version, n_max_tries, max_time_per_dl):
base_url = 'http://www.aire.cdmx.gob.mx/opendata/anuales_horarios_gz/contaminantes_{}.csv.gz'
if mode == 'all':
bdate = date(1980, 1, 1) #date(1960, 1, 1) # date before record starts
edate = date.today()
os.makedirs('/esarchive/obs/ghost/MEXICO_CDMX/original_files/{}/'.format(version), exist_ok=True)
download_location = '/esarchive/obs/ghost/MEXICO_CDMX/original_files/'+version+'/contaminantes_{}.csv.gz'
elif mode == 'nrt':
......@@ -39,21 +39,143 @@ def scraper(mode, version):
# download
for year in years:
url = base_url.format(year)
r = requests.get(url, timeout=120)
if r.status_code == 200:
urllib.request.urlretrieve(url, download_location.format(year))
print('Downloaded {}'.format(url))
n_tries = 0
errcode = 999
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url, timeout=max_time_per_dl)
if r.status_code == 200:
urllib.request.urlretrieve(url, download_location.format(year))
print('Downloaded {}'.format(url))
# unzip
with gzip.open(download_location.format(year), 'rb') as f_in:
with open(download_location.format(year)[:-3], 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
# remove files
os.remove(download_location.format(year))
errcode = r.status_code
elif r.status_code == 404:
print("No data found, error 404, year {}".format(year))
errcode = 200
elif r.status_code == 403:
print("Permission denied for {}".format(year))
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2 # increase waiting time
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url, n_tries, max_time_per_dl, errcode))
time.sleep(1)
# unzip
with gzip.open(download_location.format(year), 'rb') as f_in:
with open(download_location.format(year)[:-3], 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
def download_metadata(n_max_tries, max_time_per_dl):
# remove files
os.remove(download_location.format(year))
url_metadata = 'http://www.aire.cdmx.gob.mx/opendata/catalogos/cat_estacion.csv'
download_location = "/esarchive/obs/ghost/MEXICO_CDMX/metadata/network_provided/MEXICO_CDMX_META_{}.csv"
n_tries = 0
errcode = 999
today = date.today()
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url_metadata, timeout=max_time_per_dl)
if r.status_code == 200:
with open(download_location.format('_unformatted'), 'wb') as outfile:
outfile.write(r.content)
print('Downloaded metadata')
errcode = r.status_code
elif r.status_code == 404:
print("No metadata found, error 404")
errcode = 200
else:
print('No {}'.format(url))
time.sleep(1)
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2 # increase waiting time
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
time.sleep(1)
# import it as pandas to clean header
meta_file = pd.read_csv(download_location.format('_unformatted'), header=[1], encoding='ISO-8859-1')
meta_file.to_csv(download_location.format(today.strftime('%Y%m%d')), index=False)
os.remove(download_location.format('_unformatted'))
# create json from original metadata file
"""json_metadata = {}
with open('/esarchive/obs/ghost/MEXICO_CDMX/metadata/network_provided/MEXICO_CDMX_META.csv', 'r', encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['cve_estac']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/MEXICO_CDMX/metadata/processed/MEXICO_CDMX_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
"""
# create json in desired shape from current metadata file
json_metadata_now = {}
with open(download_location.format(today.strftime('%Y%m%d')), encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['cve_estac']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata_now[key] = row
# read standardised file to compare!
with open('/esarchive/obs/ghost/MEXICO_CDMX/metadata/processed/MEXICO_CDMX_META.json', 'r', encoding='ISO-8859-1') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if parameter in json_metadata_now[station].keys(): # check if column of csv exists in new file
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('{} not in new metadata file'.format(parameter))
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
for parameter in json_metadata_now[station]: # loop through all the parameters
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# is there a new parameter that wasn't in the old file?
if parameter in json_metadata[station].keys():
pass # parameter (column) is already there
else:
print('{} is new'.format(parameter))
json_metadata[station].update({parameter: json_metadata_now[station][parameter]})
# save
with open('/esarchive/obs/ghost/MEXICO_CDMX/metadata/processed/MEXICO_CDMX_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
\ No newline at end of file
......@@ -22,22 +22,18 @@ from selenium.webdriver.support import expected_conditions as EC
def scraper(mode, version):
def download_data(mode, version, n_max_tries, max_time_per_dl):
baseurl = 'https://www.miteco.gob.es/es/calidad-y-evaluacion-ambiental/temas/atmosfera-y-calidad-del-aire/calidad-del-aire/evaluacion-datos/datos/datos-2001-2021.html'
if mode == 'all':
bdate = date(2001, 1, 1) #date(1960, 1, 1) # date before record starts
edate = date.today()
os.makedirs('/esarchive/obs/ghost/MITECO/original_files/{}/'.format(version), exist_ok=True)
download_location = '/esarchive/obs/ghost/MITECO/original_files/{}/'.format(version)
elif mode == 'nrt':
bdate = date(2024, 3, 2) #date.today() - timedelta(days = 1) # if code is run after 2 am, data from previous day will be available
edate = date(2024, 3, 3) #date.today() - timedelta(days = 1)
print("nrt not available")
download_location = '/esarchive/obs/ghost/MITECO/original_files/nrt/'
else:
......@@ -70,23 +66,40 @@ def scraper(mode, version):
for zip_link in zip_links:
filename = zip_link.get("href").rpartition('/')[-1]
url = 'https://www.miteco.gob.es/{}'.format(zip_link.get("href"))
n_tries = 0
errcode = 999
r = requests.get(url, timeout=120)
if r.status_code == 200:
urllib.request.urlretrieve(url, download_location+filename)
print('Downloaded {}'.format(filename))
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url, timeout=max_time_per_dl)
if r.status_code == 200:
urllib.request.urlretrieve(url, download_location+filename)
print('Downloaded {}'.format(filename))
# unzip
with zipfile.ZipFile(download_location+filename, 'r') as zip_ref:
zip_ref.extractall(download_location)
os.remove(download_location+filename)
# unzip
with zipfile.ZipFile(download_location+filename, 'r') as zip_ref:
zip_ref.extractall(download_location)
os.remove(download_location+filename)
else:
print('No {}'.format(url))
errcode = r.status_code
elif r.status_code == 404:
print("No data found, error 404")
errcode = 200
elif r.status_code == 403:
print("Permission denied for {}".format(url))
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url, n_tries, max_time_per_dl, errcode))
time.sleep(1)
# go to hyperlinks
......@@ -118,21 +131,41 @@ def scraper(mode, version):
os.remove(zip_file)
continue
n_tries = 0
errcode = 999
r = requests.get(url, timeout=120)
if r.status_code == 200:
urllib.request.urlretrieve(url, download_location+filename)
print('Downloaded {}'.format(filename))
# unzip
with zipfile.ZipFile(download_location+filename, 'r') as zip_ref:
zip_ref.extractall(download_location)
os.remove(download_location+filename)
else:
print('No {}'.format(url))
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url, timeout=max_time_per_dl)
if r.status_code == 200:
urllib.request.urlretrieve(url, download_location+filename)
print('Downloaded {}'.format(filename))
# unzip
with zipfile.ZipFile(download_location+filename, 'r') as zip_ref:
zip_ref.extractall(download_location)
os.remove(download_location+filename)
errcode = r.status_code
elif r.status_code == 404:
print("No data found, error 404")
errcode = 200
elif r.status_code == 403:
print("Permission denied for {}".format(url))
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url, n_tries, max_time_per_dl, errcode))
time.sleep(1)
# delete metadata
......@@ -152,3 +185,96 @@ def scraper(mode, version):
driver.close()
def download_metadata(n_max_tries, max_time_per_dl):
url_metadata = 'https://www.miteco.gob.es/content/dam/miteco/es/calidad-y-evaluacion-ambiental/sgalsi/atm%C3%B3sfera-y-calidad-del-aire/evaluaci%C3%B3n-2022/Metainformacion2022.xlsx'
download_location = "/esarchive/obs/ghost/MITECO/metadata/network_provided/MITECO_META_{}.xlsx"
Headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
n_tries = 0
errcode = 999
today = date.today()
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url_metadata, headers=Headers, timeout=max_time_per_dl) # issue the request inside the retry loop so every attempt re-fetches
if r.status_code == 200:
with open(download_location.format(today.strftime('%Y%m%d')), 'wb') as outfile:
outfile.write(r.content)
print('Downloaded metadata')
errcode = r.status_code
elif r.status_code == 404:
print("No metadata found, error 404")
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
time.sleep(1)
# convert to csv
file = pd.read_excel(download_location.format(today.strftime('%Y%m%d')))
file.to_csv('/esarchive/obs/ghost/MITECO/metadata/network_provided/MITECO_META_{}.csv'.format(today.strftime('%Y%m%d')), index=False, header=True)
"""# create json from original metadata file
json_metadata = {}
with open('/esarchive/obs/ghost/MITECO/metadata/network_provided/MITECO_META.csv', 'r', encoding='ISO-8859-1') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['SiteName_NomDuSite']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/MITECO/metadata/processed/MITECO_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
# create json in desired shape from current metadata file
json_metadata_now = {}
with open(download_location.format(today.strftime('%Y%m%d')), encoding='ISO-8859-1') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['SiteName_NomDuSite']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata_now[key] = row
# read standardised file to compare!
with open('/esarchive/obs/ghost/MITECO/metadata/processed/MITECO_META.json', 'r', encoding='ISO-8859-1') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# save
with open('/esarchive/obs/ghost/MITECO/metadata/processed/MITECO_META.json', 'w', encoding='ISO-8859-1') as f:
f.write(json.dumps(json_metadata, indent=4))"""
......@@ -7,20 +7,21 @@ import re
import os
from datetime import date
from datetime import timedelta
import requests
import csv
import json
import time
def scraper(mode, version):
def download_data(mode, version, n_max_tries, max_time_per_dl):
if mode == 'all':
start_year = 1971
end_year = 2024
elif mode == 'nrt':
start_year = date.today().strftime('%Y')
end_year = (date.today() + timedelta(days=365)).strftime('%Y')
version = mode
else:
......@@ -36,7 +37,7 @@ def scraper(mode, version):
read_url = False
while read_url == False:
try:
link_data = re.findall("href=[\"\'](.*?)[\"\']", urlopen(link_url, timeout=15, cafile=certifi.where()).read().decode('utf-8-sig'))
link_data = re.findall("href=[\"\'](.*?)[\"\']", urlopen(link_url, timeout=max_time_per_dl, cafile=certifi.where()).read().decode('utf-8-sig'))
read_url = True
except HTTPError as error:
print('Data not retrieved because %s\nURL: %s'%(error, link_url))
......@@ -57,11 +58,10 @@ def scraper(mode, version):
# handles the issue of the server sporadically hanging for 3 minutes
# try downloading each link a certain number of times before giving up
n_tries_limit = 3
for link in link_list:
n_tries = 0
errcode = 999
while (n_tries < n_tries_limit) & (errcode != 0):
while (n_tries < n_max_tries) & (errcode != 0):
if n_tries == 0:
print('Checking/Downloading %s'%(link))
else:
......@@ -91,4 +91,103 @@ def scraper(mode, version):
cmd = 'rm {}/{}'.format(specific_directory,lnk)
process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf8', shell=True)
status = process.communicate()[0]
errcode = process.returncode
\ No newline at end of file
errcode = process.returncode
def download_metadata(n_max_tries, max_time_per_dl):
url_metadata = 'https://www.ncei.noaa.gov/pub/data/noaa/isd-history.csv'
download_location = "/esarchive/obs/ghost/NOAA_ISD/metadata/network_provided/NOAA_ISD_META_{}.csv"
n_tries = 0
errcode = 999
today = date.today()
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url_metadata, timeout=max_time_per_dl)
if r.status_code == 200:
with open(download_location.format(today.strftime('%Y%m%d')), 'wb') as outfile:
outfile.write(r.content)
print('Downloaded metadata')
errcode = r.status_code
elif r.status_code == 404:
print("No metadata found, error 404")
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2 # increase waiting time
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
time.sleep(1)
"""
# create json from original metadata file
json_metadata = {}
with open('/esarchive/obs/ghost/NOAA_ISD/metadata/network_provided/NOAA_ISD_META.csv', 'r', encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['USAF']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/NOAA_ISD/metadata/processed/NOAA_ISD_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
"""
# create json in desired shape from current metadata file
json_metadata_now = {}
with open(download_location.format(today.strftime('%Y%m%d')), encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['USAF']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata_now[key] = row
# read standardised file to compare!
with open('/esarchive/obs/ghost/NOAA_ISD/metadata/processed/NOAA_ISD_META.json', 'r', encoding='utf-8') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if parameter in json_metadata_now[station].keys(): # check if column of csv exists in new file
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('{} not in new metadata file'.format(parameter))
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
for parameter in json_metadata_now[station]: # loop through all the parameters
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# is there a new parameter that wasn't in the old file?
if parameter in json_metadata[station].keys():
pass # parameter (column) is already there
else:
print('{} is new'.format(parameter))
json_metadata[station].update({parameter: json_metadata_now[station][parameter]})
# save
with open('/esarchive/obs/ghost/NOAA_ISD/metadata/processed/NOAA_ISD_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
\ No newline at end of file
......@@ -47,15 +47,14 @@ def download_data(mode, version, n_max_tries, max_time_per_dl):
options.add_argument("--no-sandbox")
#options.add_argument("--headless")
svc = webdriver.ChromeService(executable_path=binary_path)
driver = webdriver.Chrome(service=svc, options=options)
n_tries = 0
errcode = 999
while (n_tries < n_max_tries) and (errcode != 200):
try:
svc = webdriver.ChromeService(executable_path=binary_path)
driver = webdriver.Chrome(service=svc, options=options)
# open url
driver.get(baseurl)
WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.ID, 'sites-list'))) # wait till loaded
......@@ -105,14 +104,11 @@ def download_data(mode, version, n_max_tries, max_time_per_dl):
print("Number of tries: {}".format(n_tries))
continue
driver.close()
driver.close()
if n_tries == n_max_tries:
print('Failed downloading US_NADP_AMNet data {} times in {} seconds'.format(n_tries, max_time_per_dl))
print(os.path.split(download_location[:-5]))
os.rename("{}AMNET-ALL-h.csv".format(download_location), "/esarchive/obs/ghost/US_NADP_AMNet/original_files/{}/AMNET-ALL-h.csv".format(version))
......@@ -229,7 +225,7 @@ def download_metadata(n_max_tries, max_time_per_dl):
for parameter in json_metadata[station]:
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadata file
print("new {} --- old {}".format(json_metadata_now[station][parameter]['values'][0], json_metadata[station][parameter]['values'][-1]))
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
......
......@@ -8,9 +8,9 @@ import pandas
import os.path
import urllib
import time
import ssl
import zipfile
from compare_two_files import compare_files
import json
import csv
from chromedriver_py import binary_path
from selenium.webdriver.chrome.options import Options
......@@ -22,22 +22,16 @@ from selenium.webdriver.support import expected_conditions as EC
def scraper(mode, version):
def download_data(mode, version, n_max_tries, max_time_per_dl):
baseurl = 'https://nadp.slh.wisc.edu/networks/ammonia-monitoring-network/'
if mode == 'all':
bdate = date(2013, 12, 1) #date(1960, 1, 1) # date before record starts
edate = date(2024, 1, 1) #date.today() - timedelta(days = 1)
os.makedirs('/esarchive/obs/ghost/US_NADP_AMoN/original_files/{}/'.format(version), exist_ok=True)
download_location = '/esarchive/obs/ghost/US_NADP_AMoN/original_files/{}/'.format(version)
os.makedirs('/esarchive/obs/ghost/US_NADP_AMoN/original_files/{}/temp/'.format(version), exist_ok=True)
download_location = '/esarchive/obs/ghost/US_NADP_AMoN/original_files/{}/temp/'.format(version)
elif mode == 'nrt':
bdate = date(2024, 1, 1) #date.today() - timedelta(days = 1)
edate = date.today() - timedelta(days = 1)
download_location = '/esarchive/obs/ghost/US_NADP_AMoN/original_files/nrt/'
download_location = '/esarchive/obs/ghost/US_NADP_AMoN/original_files/nrt/temp/'
else:
print('time mode inapplicable')
......@@ -52,38 +46,198 @@ def scraper(mode, version):
svc = webdriver.ChromeService(executable_path=binary_path)
driver = webdriver.Chrome(service=svc, options=options)
n_tries = 0
errcode = 999
while (n_tries < n_max_tries) and (errcode != 200):
try:
# open url
driver.get(baseurl)
WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.ID, 'sites-list'))) # wait till loaded
dropdown_element = driver.find_element(By.ID, 'data-type')
select = Select(dropdown_element)
options = [opt.get_attribute("text") for opt in select.options]
select.select_by_visible_text("Bi-weekly")
time.sleep(max_time_per_dl)
dropdown_element = driver.find_element(By.ID, 'sites-list')
select = Select(dropdown_element)
options = [opt.get_attribute("text") for opt in select.options]
select.select_by_visible_text("All Sites")
time.sleep(max_time_per_dl)
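# the 'invalid' checkbox presumably controls whether invalidated samples are included in the export (assumption based on the element id)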
invalid_box = driver.find_element(By.ID, 'invalid')
invalid_box.click()
time.sleep(max_time_per_dl)
# download
driver.find_element(By.ID, 'generate-button-text').click()
# wait until download finished
while not os.path.exists("{}AMoN-ALL-W-i.csv".format(download_location)):
time.sleep(1)
if os.path.isfile("{}AMoN-ALL-W-i.csv".format(download_location)):
print('AMoN-ALL-W-i.csv download successful')
errcode = 200
continue
except TimeoutException as e:
print(e)
max_time_per_dl = max_time_per_dl*2 # set waiting time to double
n_tries = n_tries+1
print("Number of tries: {}".format(n_tries))
continue
except WebDriverException as e:
print(e)
n_tries = n_tries+1
print("Number of tries: {}".format(n_tries))
continue
except Exception as e:
print("Unknown error: {}".format(e))
n_tries = n_tries+1
print("Number of tries: {}".format(n_tries))
continue
driver.close()
if n_tries == n_max_tries:
print('Failed downloading US_NADP_AMoN data {} times in {} seconds'.format(n_tries, max_time_per_dl))
os.rename("{}AMoN-ALL-W-i.csv".format(download_location), "/esarchive/obs/ghost/US_NADP_AMoN/original_files/{}/AMoN-ALL-W-i.csv".format(version))
def download_metadata(n_max_tries, max_time_per_dl):
# open url
driver.get(baseurl)
WebDriverWait(driver, 60).until(EC.presence_of_all_elements_located((By.ID, 'sites-list'))) # wait till loaded
dropdown_element = driver.find_element(By.ID, 'data-type')
select = Select(dropdown_element)
options = [opt.get_attribute("text") for opt in select.options]
print(options)
select.select_by_visible_text("Bi-weekly")
time.sleep(3)
dropdown_element = driver.find_element(By.ID, 'sites-list')
select = Select(dropdown_element)
options = [opt.get_attribute("text") for opt in select.options]
print(options)
select.select_by_visible_text("All Sites")
time.sleep(3)
invalid_box = driver.find_element(By.ID, 'invalid')
invalid_box.click()
time.sleep(3)
# download
driver.find_element(By.ID, 'generate-button-text').click()
baseurl = 'https://nadp.slh.wisc.edu/networks/ammonia-monitoring-network/'
#os.makedirs('/esarchive/obs/ghost/US_NADP_AMoN/metadata/network_provided/', exist_ok=True)
download_location = '/esarchive/obs/ghost/US_NADP_AMoN/metadata/network_provided/'
today = date.today()
# wait until download finished
while not os.path.exists("{}AMoN-ALL-W-i.csv".format(download_location)):
time.sleep(1)
# set up driver
options = Options()
prefs = {'download.default_directory' : download_location}
options.add_experimental_option('prefs', prefs)
options.add_argument("--no-sandbox")
options.add_argument("--headless")
if os.path.isfile("{}AMoN-ALL-W-i.csv".format(download_location)):
print('AMoN-ALL-W-i.csv download successful')
n_tries = 0
errcode = 999
svc = webdriver.ChromeService(executable_path=binary_path)
driver = webdriver.Chrome(service=svc, options=options)
driver.maximize_window()
while (n_tries < n_max_tries) and (errcode != 200):
try:
# open url
driver.get(baseurl)
#WebDriverWait(driver, max_time_per_dl).until(EC.element_to_be_clickable((By.ID, 'invalid'))) # wait till loaded
time.sleep(max_time_per_dl)
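# clicks below go through execute_script rather than .click(), which tends to be more reliable in headless mode when an element is not scrolled into view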
invalid_box = driver.find_element(By.ID, 'download-show-inactive')
driver.execute_script("arguments[0].click()", invalid_box)
# download
#WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.ID, 'generate-button-text'))) # wait till loaded
time.sleep(max_time_per_dl)
bttn = driver.find_element(By.ID, 'network-data-submit')
driver.execute_script("arguments[0].click()", bttn)
# wait until download finished
while not os.path.exists(download_location+'amon.csv'):
time.sleep(1)
if os.path.isfile(download_location+'amon.csv'):
print('AMoN metadata download successful')
errcode = 200
continue
except TimeoutException as e:
print(e)
max_time_per_dl = max_time_per_dl*2 # set waiting time to double
n_tries = n_tries+1
continue
except WebDriverException as e:
print(e)
n_tries = n_tries+1
print("Number of tries: {}".format(n_tries))
continue
except Exception as e:
print("Unknown error: {}".format(e))
max_time_per_dl = max_time_per_dl*2 # set waiting time to double
n_tries = n_tries+1
continue
if n_tries == n_max_tries:
print('Failed downloading AMoN metadata {} times in {} seconds'.format(n_tries, max_time_per_dl))
driver.close()
os.rename(download_location+'amon.csv', download_location+'US_NADP_AMoN_META_{}.csv'.format(today.strftime('%Y%m%d')))
# create json from original metadata file =====================================================================================
"""json_metadata = {}
with open('/esarchive/obs/ghost/US_NADP_AMoN/metadata/network_provided/US_NADP_AMoN_META.csv', 'r') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['siteId']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/US_NADP_AMoN/metadata/processed/US_NADP_AMoN_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
"""
# create json in desired shape from current metadata file
json_metadata_now = {}
with open(download_location+'US_NADP_AMoN_META_{}.csv'.format(today.strftime('%Y%m%d')), encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['siteId']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata_now[key] = row
# read standardised file to compare!
with open('/esarchive/obs/ghost/US_NADP_AMoN/metadata/processed/US_NADP_AMoN_META.json', 'r', encoding='utf-8') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if the value differs, append it to the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# save
with open('/esarchive/obs/ghost/US_NADP_AMoN/metadata/processed/US_NADP_AMoN_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
driver.close()
\ No newline at end of file
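# The compare-and-append pattern above is repeated almost verbatim for every network scraper;
# a minimal sketch of how it could be shared (helper name and signature are illustrative only):
def merge_metadata(json_metadata, json_metadata_now):
    """Fold freshly scraped metadata into the standardised record, keeping the value history."""
    for station, parameters in json_metadata.items():
        if station not in json_metadata_now:
            print('Station {} was abolished'.format(station))
            continue
        for parameter, record in parameters.items():
            new = json_metadata_now[station][parameter]
            if record['values'][-1] != new['values'][0]:
                print("old {} --- new {}".format(record['values'][-1], new['values'][0]))
                record['values'].append(new['values'][0])
                record['update_time'].append(new['update_time'][0])
    for station in json_metadata_now:  # any station not seen before is added wholesale
        if station not in json_metadata:
            print('New station {}'.format(station))
            json_metadata[station] = json_metadata_now[station]
    return json_metadata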
import hashlib # hashing works for all types of data
import requests
def request_download(url, file, year, n_max_tries, max_time_per_dl, download_location):
    # url: full download url; file: filename template with a '{}' placeholder filled with year
    Headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
    n_tries = 0
    errcode = 999
    while (n_tries < n_max_tries) and (errcode != 200):
        r = requests.get(url, headers=Headers, timeout=max_time_per_dl)  # request anew on every attempt
        if r.status_code == 200:
            open(download_location + file.format(year), "wb").write(r.content)
            print('Downloaded {}'.format(file.format(year)))
            errcode = r.status_code
        elif r.status_code == 404:
            print("No ozone data found, error 404")
            errcode = 200  # nothing to retry
        elif r.status_code == 403:
            print("Permission denied for {}".format(file.format(year)))
            errcode = 200  # nothing to retry
        else:
            # try again
            print('Response error {}, attempt {}'.format(r.status_code, n_tries))
            errcode = r.status_code
            n_tries += 1
            time.sleep(n_tries ** 2)  # wait a little longer after every attempt
    if n_tries == n_max_tries:
        print('Failed downloading {} {} times in {} seconds, error code {}'.format(url, n_tries, max_time_per_dl, errcode))
    time.sleep(1)
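# Illustrative call (URL, filename template and path are placeholders, not real endpoints):
# request_download('https://example.org/o3/2023.zip', 'o3_{}.zip', 2023,
#                  n_max_tries=3, max_time_per_dl=30,
#                  download_location='/esarchive/obs/ghost/<network>/original_files/1.6/')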
# check if files are different
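# A minimal sketch of a hash-based file comparison (e.g. to decide whether a re-downloaded
# file actually changed); the helper name is illustrative, not part of the original script:
def files_are_identical(path_a, path_b):
    """Compare two files by their SHA-256 digests, reading in chunks to keep memory use low."""
    def digest(path):
        h = hashlib.sha256()
        with open(path, 'rb') as f:
            for chunk in iter(lambda: f.read(8192), b''):
                h.update(chunk)
        return h.hexdigest()
    return digest(path_a) == digest(path_b)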
......
......@@ -61,7 +61,13 @@ if __name__ == "__main__":
'CNEMC': {'max_time_dl': 3},
'CANADA_NAPS': {'max_time_dl': 5},
'CAPMoN': {'max_time_dl': 5},
'US_NADP_AMNet': {'max_time_dl': 10}}
'US_NADP_AMNet': {'max_time_dl': 10},
'US_NADP_AMoN': {'max_time_dl': 7},
'MEXICO_CDMX': {'max_time_dl': 10},
'NOAA_ISD': {'max_time_dl': 15},
'MITECO': {'max_time_dl': 10},
'EANET': {'max_time_dl': 5},
'CHILE_SINCA': {'max_time_dl': 30}}
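# Sketch of how a per-network setting above might be consumed further down (the dictionary
# name 'settings' is an assumption; only the download_data signature is taken from the script):
# max_time_per_dl = settings['CHILE_SINCA']['max_time_dl']
# CHILE_SINCA_download.download_data(mode, version, n_max_tries, max_time_per_dl)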
# download data
......@@ -71,7 +77,7 @@ if __name__ == "__main__":
dl_metadata = True
# networks you want to download
networks = [US_NADP_AMNet_download]
networks = [CHILE_SINCA_download]
# download all networks
#networks = ['all']
......@@ -107,8 +113,6 @@ if __name__ == "__main__":
if dl_data == True:
network.download_data(mode, version, n_max_tries, max_time_per_dl)
pass
if dl_metadata == True:
network.download_metadata(n_max_tries, max_time_per_dl)
pass