from selenium import webdriver
from chromedriver_py import binary_path
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import WebDriverException
from bs4 import BeautifulSoup
from datetime import date
import pandas as pd
import time
import shutil
import os
import re
import csv
import json


def download_data(mode, version, n_max_tries, max_time_per_dl):
    """Download CHILE_SINCA observational data per station and component."""

    # paths and variables
    variables_website = ['MP 10', 'MP 2,5', 'MP 10discreto', 'SO2', 'NO2', 'NOX', 'NO', 'CO', 'O3', 'Pb', 'As', 'Cu']
    variables_text = ["Material particulado MP 10", "Material particulado MP 2,5",
                      "Material particulado 10 micrometros discreto", 'Dióxido de azufre',
                      'Dióxido de nitrógeno', 'Óxidos de nitrógeno', 'Monóxido de nitrógeno',
                      'Monóxido de carbono', 'Ozono', 'Plomo', 'Arsénico', 'Cobre']
    time_resolutions_website = ["registro diario", "registro horario"]
    variables_ghost = ['PM10', 'PM2.5', 'PM10disc', 'SO2', 'NO2', 'NOX', 'NO', 'CO', 'O3', 'Pb', 'As', 'Cu']  # needs to be in the same order as variables_website!
    time_resolution_ghost = ['daily', 'hourly']

    metadata15 = pd.read_csv("/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/CHILE_SINCA_META.txt")
    baseurl = 'https://sinca.mma.gob.cl/index.php/'

    if mode == 'all':
        # create download directory
        os.makedirs('/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version), exist_ok=True)
        download_location = '/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version)
    elif mode == 'nrt':
        bdate = date(date.today().year, 1, 1).strftime('%Y%m%d')[2:]  # "240101"
        edate = date.today().strftime('%Y%m%d')[2:]
        print(edate)
        # create download directory
        version = mode
        os.makedirs('/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version), exist_ok=True)
        download_location = '/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version)
    else:
        print('time mode inapplicable')
        return  # without a valid mode there is no download location

    n_tries = 0
    errcode = 999

    while (n_tries < n_max_tries) and (errcode != 200):
        try:
            # set up driver
            options = Options()
            prefs = {'download.default_directory': download_location}
            options.add_experimental_option('prefs', prefs)
            options.add_argument("--no-sandbox")
            options.add_argument("disable-infobars")
            options.add_argument("--disable-extensions")
            options.add_argument("--disable-gpu")
            options.add_argument("--disable-dev-shm-usage")
            if n_tries > 0:
                options.add_argument("--headless")
            svc = webdriver.ChromeService(executable_path=binary_path)
            driver = webdriver.Chrome(service=svc, options=options)

            driver.get(baseurl)
            WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.XPATH, '//a[contains(@href, "/index.php/region/index/id/")]')))  # wait until loaded

            # navigate to regions
            html = driver.page_source
            soup = BeautifulSoup(html, features="html.parser")
            regions = soup.find_all("a", href=re.compile(r"^/index.php/region/index/id/"))
            regions = soup.find_all("a", {"href": "/index.php/region/index/id/II"})  # Antofagasta
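            # NOTE: the second find_all overwrites the first and restricts the crawl
            # to the Antofagasta region (id "II"); drop it to scrape every region
            # matched by the regular expression above.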
            for region in regions:
                print("Region is " + region.getText())
                driver.get("https://sinca.mma.gob.cl/" + region.get("href"))
                time.sleep(3)

                # navigate to station and component
                html_region = driver.page_source
                soup_region = BeautifulSoup(html_region, features="html.parser")
                a_titles = [a.get("title") for a in soup_region.find_all("a", title=True)]  # all links

                # from all the links, keep only the ones that point to a component
                stations_components = []
                for a_title in a_titles:
                    for variable_text in variables_text:
                        if variable_text in a_title:
                            stations_components.append(soup_region.find("a", {"title": a_title}))

                # loop through all stations and components of the region
                for station_component in stations_components:
                    print(station_component.get("title"))
                    station = station_component.get("title").split("| ", 1)[1]
                    component = [x for x in variables_text if x in station_component.get("title")][0]  # component name on the website
                    component_choose_time_res = variables_website[variables_text.index(component)]  # component name used for choosing the time resolution
                    component_ghost = variables_ghost[variables_text.index(component)]  # corresponding component name in GHOST

                    # create storage directory
                    try:
                        station_id = metadata15["station_reference"][metadata15["station_name"] == station].iloc[0]
                        os.makedirs('/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/{}/'.format(version, station_id), exist_ok=True)
                        storage_location = '/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/{}/'.format(version, station_id)
                    except (IndexError, KeyError):
                        print("{} was added after GHOST v1.5; no station_id found in the v1.5 metadata, so no data was downloaded for this new station".format(station))
                        continue

                    # go to the data page on the website
                    driver.get('https:' + station_component.get("href"))
                    time.sleep(5)
                    driver.switch_to.frame("left")
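                    # the data page is a frameset: the "left" frame holds the query
                    # form (component/time-resolution dropdown and date fields), the
                    # "right" frame the results table with the "Excel CSV" export link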
                    # select time resolution
                    dropdown_element = driver.find_element(By.ID, 'ic')
                    select = Select(dropdown_element)
                    options = [opt.get_attribute("text") for opt in select.options]
                    i = 0
                    for time_resolution in time_resolutions_website:  # select the time resolution, if it exists!
                        if (component_choose_time_res + ' - ' + time_resolution) in options:
                            select.select_by_visible_text(component_choose_time_res + ' - ' + time_resolution)
                            #print("Time resolution is: {}".format(time_resolution_ghost[i]))
                            time.sleep(5)

                            if mode == "all":
                                start_date = driver.find_element(By.ID, "from").get_attribute("value")
                                end_date = driver.find_element(By.ID, "to").get_attribute("value")
                            if mode == "nrt":  # updating the dates via the form is difficult, so set them with JavaScript
                                date_from = driver.find_element(By.ID, "from")
                                driver.execute_script("arguments[0].value = {};".format(bdate), date_from)
                                date_to = driver.find_element(By.ID, "to")
                                driver.execute_script("arguments[0].value = {};".format(edate), date_to)
                                # the exported file is named after the dates now in the form
                                start_date = bdate
                                end_date = edate

                            time.sleep(10)
                            driver.switch_to.default_content()
                            driver.switch_to.frame("right")
                            time.sleep(10)

                            WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.LINK_TEXT, "Excel CSV")))  # wait until loaded
                            driver.find_element(By.LINK_TEXT, "Excel CSV").click()

                            # wait until the download has finished
                            while not os.path.exists("{}/datos_{}_{}.csv".format(download_location, start_date, end_date)):
                                time.sleep(1)

                            if os.path.isfile("{}/datos_{}_{}.csv".format(download_location, start_date, end_date)):
                                print('{} {} download successful'.format(station_component.get("title"), time_resolution_ghost[i]))
                                shutil.copyfile("{}/datos_{}_{}.csv".format(download_location, start_date, end_date),
                                                storage_location + component_ghost + '_' + time_resolution_ghost[i] + '.csv')
                                os.remove("{}/datos_{}_{}.csv".format(download_location, start_date, end_date))

                            driver.switch_to.default_content()
                            driver.switch_to.frame("left")
                        i = i + 1

            driver.close()
            errcode = 200

        except WebDriverException as e:
            print(e)
            n_tries = n_tries + 1
            print("Number of tries: {}".format(n_tries))
            continue

    if n_tries == n_max_tries:
        print('Failed downloading CHILE_SINCA data after {} tries of {} seconds each'.format(n_tries, max_time_per_dl))
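
# The download loop above blocks indefinitely if the CSV export never appears.
# A minimal bounded alternative is sketched here ("wait_for_file" is not part of
# the original code, and the 1-second poll interval is an assumption):
def wait_for_file(path, timeout_s, poll_s=1):
    """Poll until `path` exists; return True on success, False on timeout."""
    waited = 0
    while waited < timeout_s:
        if os.path.isfile(path):
            return True
        time.sleep(poll_s)
        waited += poll_s
    return False
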
def download_metadata(n_max_tries, max_time_per_dl):
    """Scrape CHILE_SINCA station metadata and collect it per station reference."""

    # paths and variables
    variables_website = ['MP 10', 'MP 2,5', 'MP 10discreto', 'SO2', 'NO2', 'NOX', 'NO', 'CO', 'O3', 'Pb', 'As', 'Cu']
    variables_text = ["Material particulado MP 10", "Material particulado MP 2,5",
                      "Material particulado 10 micrometros discreto", 'Dióxido de azufre',
                      'Dióxido de nitrógeno', 'Óxidos de nitrógeno', 'Monóxido de nitrógeno',
                      'Monóxido de carbono', 'Ozono', 'Plomo', 'Arsénico', 'Cobre']
    time_resolutions_website = ["registro diario", "registro horario"]
    variables_ghost = ['PM10', 'PM2.5', 'PM10disc', 'SO2', 'NO2', 'NOX', 'NO', 'CO', 'O3', 'Pb', 'As', 'Cu']  # needs to be in the same order as variables_website!
    time_resolution_ghost = ['daily', 'hourly']

    baseurl = 'https://sinca.mma.gob.cl/index.php/'
    today = date.today()

    metadata15 = pd.read_csv("/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/temp/CHILE_SINCA_META_temp.txt")
    #print(metadata15)
    instruments_old = ['O3_instrument', 'NO_instrument', 'NO2_instrument', 'CO_instrument', 'CH4_instrument', 'SO2_instrument', 'NMHC_instrument', 'HC_instrument', 'PM10_instrument', 'PM2.5_instrument', 'As_instrument', 'Cu_instrument', 'Pb_instrument']

    with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/processed/CHILE_SINCA_META.json', 'r', encoding='utf-8') as f:
        metadata_old = json.loads(f.read())

    n_tries = 0
    errcode = 999
    metadata_new = {}

    while (n_tries < n_max_tries) and (errcode != 200):
        try:
            # set up driver
            options = Options()
            #prefs = {'download.default_directory': download_location}
            #options.add_experimental_option('prefs', prefs)
            options.add_argument("--no-sandbox")
            options.add_argument("--headless")
            svc = webdriver.ChromeService(executable_path=binary_path)
            driver = webdriver.Chrome(service=svc, options=options)

            # open url
            driver.get(baseurl)
            WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.XPATH, '//a[contains(@href, "/index.php/region/index/id/")]')))  # wait until loaded

            # navigate to regions
            html = driver.page_source
            soup = BeautifulSoup(html, features="html.parser")
            regions = soup.find_all("a", href=re.compile(r"^/index.php/region/index/id/"))
            regions = soup.find_all("a", {"href": "/index.php/region/index/id/II"})  # Antofagasta only, overwriting the line above (see the note in download_data)

            for region in regions:
                print("Region is " + region.getText())
                driver.get("https://sinca.mma.gob.cl/" + region.get("href"))
                time.sleep(3)

                html = driver.page_source
                soup = BeautifulSoup(html, features="html.parser")
                stations = soup.find_all("a", href=re.compile(r"^/index.php/estacion/index/id/"))
                for station in stations:
                    station_name_new = station.getText()
                    print(station_name_new)
                    driver.get("https://sinca.mma.gob.cl/" + station.get("href"))
                    time.sleep(3)

                    # get meta info
                    html = driver.page_source
                    soup = BeautifulSoup(html, features="html.parser")
                    region_name = soup.find("th", string="Región").find_next_sibling().getText()
                    province = soup.find("th", string="Provincia").find_next_sibling().getText()
                    commune = soup.find("th", string="Comuna").find_next_sibling().getText()
                    UTM_coordinates = soup.find("th", string="Coordenadas UTM").find_next_sibling().getText().replace(' ', '')
                    lon = UTM_coordinates.split('E')[0] + 'E'
                    lat = UTM_coordinates.split('E')[1]
                    timezone = soup.find("th", string="Huso horario").find_next_sibling().getText()

                    ins_table = soup.find('table', id="medicion")
                    instruments = ins_table.find_all("td", {"class": "helpTecnica center"})
                    instruments_per_component = {}
                    for instrument in instruments:
                        component = instrument.find_parent().find('a').getText()
                        if len(component) > 5:  # filter out the short names that are already correct and do not need to be renamed
                            component = variables_ghost[variables_text.index(component)]
                        elif 'MP 1,5' in component:
                            component = 'MP2.5'  #======
                        if "No informado" in instrument.getText():
                            instruments_per_component[component] = None
                        else:
                            instrument_name = re.sub(' +', ' ', instrument.getText())
                            instrument_name = instrument_name.split("\n")[-1]
                            instruments_per_component[component] = instrument_name
                    print(instruments_per_component)
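                    # NOTE: lon/lat above are raw UTM easting/northing strings, not
                    # geographic degrees. If degrees were needed, a conversion along
                    # these lines could work (a sketch only; pyproj and UTM zone 19S
                    # are assumptions -- Chilean stations span zones 18S/19S):
                    #   from pyproj import Transformer
                    #   to_wgs84 = Transformer.from_crs("EPSG:32719", "EPSG:4326", always_xy=True)
                    #   lon_deg, lat_deg = to_wgs84.transform(easting, northing)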
                    for station_reference in metadata_old:
                        if metadata_old[station_reference]['station_name']['values'][0] == station_name_new:  # match the station with its station_reference from the old file
                            i = 0
                            scraped_metadata = [station_reference, station_name_new, region_name, province, commune, lon, lat, timezone]
                            metadata_new.setdefault(station_reference, {})  # make sure the station entry exists before updating it
                            for parameter in metadata_old[station_reference]:
                                #print(parameter)
                                if ("instrument" not in parameter) and ("comments" not in parameter):
                                    metadata_new[station_reference].update({parameter: {"values": [scraped_metadata[i]], "update_time": [today.strftime('%Y-%m-%d')]}})
                                    i = i + 1
                                """
                                elif "comments" == parameter:
                                    metadata_new.update({station_reference: {parameter: {"values": [None], "update_time": [today.strftime('%Y-%m-%d')]}}})
                                    #metadata_new[station_reference] = {parameter: {"values": [None], "update_time": [today.strftime('%Y-%m-%d')]}}
                                else:
                                    for component in instruments_per_component:
                                        if component in parameter:
                                            #metadata_new[station_reference] = {parameter: {"values": [instruments_per_component[component]], "update_time": [today.strftime('%Y-%m-%d')]}}
                                            metadata_new.update({station_reference: {parameter: {"values": [instruments_per_component[component]], "update_time": [today.strftime('%Y-%m-%d')]}}})
                                        else:
                                            #metadata_new[station_reference] = {parameter: {"values": [None], "update_time": [today.strftime('%Y-%m-%d')]}}
                                            metadata_new.update({station_reference: {parameter: {"values": [None], "update_time": [today.strftime('%Y-%m-%d')]}}})
                                """

                    print(metadata_new)

                    """
                    metadata15_per_station = metadata15.loc[metadata15["station_name"] == station_name]
                    print(region)
                    print(metadata15_per_station)
                    print(metadata15_per_station["region"].iloc[0])
                    i = 0
                    for column in metadata15_per_station.head():
                        print(column)
                        if metadata15_per_station[column].iloc[0] == scraped_metadata[i]:
                            print("ok!")
                        else:
                            print("not ok")
                        i = i + 1
                    """

            driver.close()
            errcode = 200

        except WebDriverException as e:
            print(e)
            n_tries = n_tries + 1
            print("Number of tries: {}".format(n_tries))
            continue

    if n_tries == n_max_tries:
        print('Failed downloading CHILE_SINCA metadata after {} tries of {} seconds each'.format(n_tries, max_time_per_dl))

    print(metadata_new)

    """
    # create json from original metadata file =====================================================================================
    json_metadata = {}
    with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/CHILE_SINCA_META.txt', 'r') as file:
        csv_filedata = csv.DictReader(file)
        for row in csv_filedata:
            key = row['station_reference']
            update_date = today.strftime('%Y-%m-%d')
            for parameter in row:
                row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]}  # create an inner dictionary for every parameter
            json_metadata[key] = row

    with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/processed/CHILE_SINCA_META.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(json_metadata, indent=4))
    """

    """
    # read standardised file to compare!
    with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/processed/CHILE_SINCA_META.json', 'r', encoding='utf-8') as f:
        json_metadata = json.loads(f.read())

    for station in json_metadata:  # loop through all the old stations
        if station in json_metadata_now.keys():  # if the station is in the current metadata, go on
            for parameter in json_metadata[station]:
                if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]:  # compare the last entry in the standardised file to the value in the new file
                    # if the value differs, append it to the standardised metadata file
                    print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
                    json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
                    json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
                else:
                    pass
        else:
            print('Station {} was abolished'.format(station))

    for station in json_metadata_now:  # loop through all the new stations
        if station in json_metadata.keys():  # if the station is in the old metadata
            pass  # the comparison was already done above
        else:  # a new station appeared!
            print('New station {}'.format(station))
            json_metadata.update({station: json_metadata_now[station]})

    # save
    with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/processed/CHILE_SINCA_META.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(json_metadata, indent=4))
    """
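

# A minimal usage sketch; the argument values are assumptions, not part of the
# original code ("1.6" as the GHOST version, 3 retries, 300-second waits):
if __name__ == '__main__':
    download_data(mode='all', version='1.6', n_max_tries=3, max_time_per_dl=300)
    download_metadata(n_max_tries=3, max_time_per_dl=300)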