from selenium import webdriver
from chromedriver_py import binary_path
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
from datetime import date
import pandas as pd
import os
import shutil
import re


def scraper(mode, version):

    # paths and variables
    variables_website = ['MP 10', 'MP 2,5', 'MP 10discreto', 'SO2', 'NO2', 'NOX', 'NO', 'CO', 'O3', 'Pb', 'As', 'Cu']  # complete list later
    variables_text = ["Material particulado MP 10", "Material particulado MP 2,5", "Material particulado 10 micrometros discreto",
                      'Dióxido de azufre', 'Dióxido de nitrógeno', 'Óxidos de nitrógeno', 'Monóxido de nitrógeno',
                      'Monóxido de carbono', 'Ozono', 'Plomo', 'Arsénico', 'Cobre']
    time_resolutions_website = ["registro diario", "registro horario"]  # complete later
    variables_ghost = ['PM10', 'PM2.5', 'PM10disc', 'SO2', 'NO2', 'NOX', 'NO', 'CO', 'O3', 'Pb', 'As', 'Cu']  # needs to be same order as variables_website!
    time_resolution_ghost = ['daily', 'hourly']

    metadata15 = pd.read_csv("/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/CHILE_SINCA_META.txt")
    baseurl = 'https://sinca.mma.gob.cl/index.php/'

    # only for nrt
    bdate = "240101"
    edate = date.today().strftime('%Y%m%d')[2:]
    print(edate)

    # create download directory
    os.makedirs('/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version), exist_ok=True)
    download_location = '/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version)

    # set up driver
    options = Options()
    prefs = {'download.default_directory': download_location}
    options.add_experimental_option('prefs', prefs)
    options.add_argument("--no-sandbox")
    svc = webdriver.ChromeService(executable_path=binary_path)
    driver = webdriver.Chrome(service=svc, options=options)

    # open url
    driver.get(baseurl)
    WebDriverWait(driver, 60).until(EC.presence_of_all_elements_located((By.XPATH, '//a[contains(@href, "/index.php/region/index/id/")]')))  # wait till loaded

    # navigate to regions
    html = driver.page_source
    soup = BeautifulSoup(html, features="html.parser")
    regions = soup.find_all("a", href=re.compile(r"^/index.php/region/index/id/"))
    regions = soup.find_all("a", {"href": "/index.php/region/index/id/II"})  # overrides the line above: restricts the run to Antofagasta

    for region in regions:
        print("Region is " + region.getText())
        driver.get("https://sinca.mma.gob.cl/" + region.get("href"))
        time.sleep(3)

        # navigate to station and component
        html_region = driver.page_source
        soup_region = BeautifulSoup(html_region, features="html.parser")
        a_titles = [a.get("title") for a in soup_region.find_all("a", title=True)]  # all links

        # from all the links, choose only the components
        stations_components = []
        for a_title in a_titles:
            for variable_text in variables_text:
                if variable_text in a_title:
                    stations_components.append(soup_region.find("a", {"title": a_title}))

        # loop through all stations and components of the region
        for station_component in stations_components:
            print(station_component.get("title"))
            station = station_component.get("title").split("| ", 1)[1]
            component = [x for x in variables_text if x in station_component.get("title")][0]  # get component name on website
            component_choose_time_res = variables_website[variables_text.index(component)]  # get component name for choosing time resolution
            component_ghost = variables_ghost[variables_text.index(component)]  # get component name accordingly in ghost

            # create storage directory
            try:
                station_id = metadata15["station_reference"][metadata15["station_name"] == station].iloc[0]
                os.makedirs('/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/{}/'.format(version, station_id), exist_ok=True)
                storage_location = '/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/{}/'.format(version, station_id)
            except IndexError:
                print("{} added since 1.5, no station_id found in metadata_1.5, did not download data from new station".format(station))
                continue

            # go to data on website
            driver.get('https:' + station_component.get("href"))
            time.sleep(5)
            driver.switch_to.frame("left")

            # select time resolution
            dropdown_element = driver.find_element(By.ID, 'ic')
            select = Select(dropdown_element)
            dropdown_options = [opt.get_attribute("text") for opt in select.options]  # renamed so it does not shadow the Chrome Options above

            # select time resolution if existent! enumerate keeps i aligned with
            # time_resolution_ghost even when a resolution is missing
            for i, time_resolution in enumerate(time_resolutions_website):
                if (component_choose_time_res + ' - ' + time_resolution) in dropdown_options:
                    select.select_by_visible_text(component_choose_time_res + ' - ' + time_resolution)
                    #print("Time resolution is: {}".format(time_resolution_ghost[i]))
                    time.sleep(5)

                    if mode == "all":
                        start_date = driver.find_element(By.ID, "from").get_attribute("value")
                        end_date = driver.find_element(By.ID, "to").get_attribute("value")

                    if mode == "nrt":  # updating dates difficult
                        start_date_field = driver.find_element(By.ID, "from")
                        driver.execute_script("arguments[0].value = '{}';".format(bdate), start_date_field)  # quote the value: the field holds a string
                        end_date_field = driver.find_element(By.ID, "to")
                        driver.execute_script("arguments[0].value = '{}';".format(edate), end_date_field)
                        start_date, end_date = bdate, edate  # keep the date strings for the filename checks below
                        time.sleep(10)

                    driver.switch_to.default_content()
                    driver.switch_to.frame("right")
                    time.sleep(10)

                    WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.LINK_TEXT, "Excel CSV")))  # wait till loaded
                    driver.find_element(By.LINK_TEXT, "Excel CSV").click()

                    # wait until download finished
                    while not os.path.exists("{}/datos_{}_{}.csv".format(download_location, start_date, end_date)):
                        time.sleep(1)

                    if os.path.isfile("{}/datos_{}_{}.csv".format(download_location, start_date, end_date)):
                        print('{} {} download successful'.format(station_component.get("title"), time_resolution_ghost[i]))
                        shutil.copyfile("{}/datos_{}_{}.csv".format(download_location, start_date, end_date),
                                        storage_location + component_ghost + '_' + time_resolution_ghost[i] + '.csv')
                        os.remove("{}/datos_{}_{}.csv".format(download_location, start_date, end_date))

                    driver.switch_to.default_content()
                    driver.switch_to.frame("left")

    driver.close()
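
# A bounded variant of the download wait used in scraper() above. The bare
# while-loop there blocks forever if the site never delivers the file; this
# sketch (not wired in, and the default timeout is an assumption) returns
# False instead of hanging.
def wait_for_download(path, timeout=300):
    # poll for the file until it appears or the timeout (in seconds) elapses
    waited = 0
    while not os.path.exists(path):
        if waited >= timeout:
            return False
        time.sleep(1)
        waited += 1
    return True
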

def scraper_metadata(mode, version):

    baseurl = 'https://sinca.mma.gob.cl/index.php/'
    metadata15 = pd.read_csv("/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/temp/CHILE_SINCA_META_temp.txt")
    print(metadata15)

    # set up driver
    options = Options()
    #prefs = {'download.default_directory' : download_location}
    #options.add_experimental_option('prefs', prefs)
    options.add_argument("--no-sandbox")
    svc = webdriver.ChromeService(executable_path=binary_path)
    driver = webdriver.Chrome(service=svc, options=options)

    # open url
    driver.get(baseurl)
    WebDriverWait(driver, 60).until(EC.presence_of_all_elements_located((By.XPATH, '//a[contains(@href, "/index.php/region/index/id/")]')))  # wait till loaded

    # navigate to regions
    html = driver.page_source
    soup = BeautifulSoup(html, features="html.parser")
    regions = soup.find_all("a", href=re.compile(r"^/index.php/region/index/id/"))
    regions = soup.find_all("a", {"href": "/index.php/region/index/id/II"})  # overrides the line above: restricts the run to Antofagasta
"/index.php/region/index/id/II"}) # Antofagasta for region in regions: print("Region is "+region.getText()) driver.get("https://sinca.mma.gob.cl/"+region.get("href")) time.sleep(3) html = driver.page_source soup = BeautifulSoup(html, features="html.parser") stations = soup.find_all("a", href=re.compile(r"^/index.php/estacion/index/id/")) for station in stations: station_name = station.getText() print(station_name) driver.get("https://sinca.mma.gob.cl/"+station.get("href")) time.sleep(3) # get meta info html = driver.page_source soup = BeautifulSoup(html, features="html.parser") region = soup.find("th", text="Región").find_next_sibling().getText() province = soup.find("th", text="Provincia").find_next_sibling().getText() commune = soup.find("th", text="Comuna").find_next_sibling().getText() UTM_coordinates = soup.find("th", text="Coordenadas UTM").find_next_sibling().getText() timezone = soup.find("th", text="Huso horario").find_next_sibling().getText() scraped_metadata = [station_reference, station_name, region, province, commune, UTM_coordinates, timezone] metadata15_per_station = metadata15.loc[metadata15["station_name"] == station_name] print(region) print(metadata15_per_station) print(metadata15_per_station["region"].iloc[0]) i=0 for column in metadata15_per_station.head(): print(column) if metadata15_per_station[column].iloc[0] == scraped_metadata[i]: print("ok!") else: print("not ok") i=i+1 driver.close()