# NOTE(review): removed non-code residue from the original web-viewer scrape
# ("Newer"/"Older" pager links and a column of display line numbers 1-55).
from selenium import webdriver
from chromedriver_py import binary_path
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup
import requests
import time
from datetime import date
from datetime import timedelta
import pandas as pd
import os.path
import urllib
import time
import ssl
import zipfile
import shutil
import os
import re
def scraper(mode, version):
    """Download CHILE SINCA air-quality data files via Selenium.

    Navigates https://sinca.mma.gob.cl, iterates regions/stations/components,
    selects each available time resolution, triggers the "Excel CSV" export and
    copies the downloaded file into the GHOST original_files tree.

    Parameters
    ----------
    mode : str
        "all"  -> download the full period offered by the site (reads the
                  pre-filled from/to form fields).
        "nrt"  -> near-real-time update: injects `bdate`..today into the form.
    version : str
        GHOST version string used to build download/storage directories.

    Returns
    -------
    None. Side effects only (filesystem writes, browser session).
    """
    # paths and variables
    variables_website = ['MP 10', 'MP 2,5', 'MP 10discreto', 'SO2', 'NO2', 'NOX', 'NO', 'CO', 'O3', 'Pb', 'As', 'Cu']  # complete list later
    variables_text = ["Material particulado MP 10", "Material particulado MP 2,5", "Material particulado 10 micrometros discreto", 'Dióxido de azufre', 'Dióxido de nitrógeno', 'Óxidos de nitrógeno', 'Monóxido de nitrógeno', 'Monóxido de carbono', 'Ozono', 'Plomo', 'Arsénico', 'Cobre']
    time_resolutions_website = ["registro diario", "registro horario"]  # complete later
    variables_ghost = ['PM10', 'PM2.5', 'PM10disc', 'SO2', 'NO2', 'NOX', 'NO', 'CO', 'O3', 'Pb', 'As', 'Cu']  # needs to be same order as variables_website!
    time_resolution_ghost = ['daily', 'hourly']
    metadata15 = pd.read_csv("/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/CHILE_SINCA_META.txt")
    baseurl = 'https://sinca.mma.gob.cl/index.php/'

    # only for nrt: date window as YYMMDD strings
    bdate = "240101"
    edate = date.today().strftime('%Y%m%d')[2:]
    print(edate)

    # create download directory
    os.makedirs('/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version), exist_ok=True)
    download_location = '/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version)

    # set up driver
    options = Options()
    prefs = {'download.default_directory': download_location}
    options.add_experimental_option('prefs', prefs)
    options.add_argument("--no-sandbox")
    options.add_argument("disable-infobars")
    options.add_argument("--disable-extensions")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-dev-shm-usage")
    svc = webdriver.ChromeService(executable_path=binary_path)
    driver = webdriver.Chrome(service=svc, options=options)

    # open url and wait until the region links are present
    driver.get(baseurl)
    WebDriverWait(driver, 60).until(EC.presence_of_all_elements_located((By.XPATH, '//a[contains(@href, "/index.php/region/index/id/")]')))

    # navigate to regions
    html = driver.page_source
    soup = BeautifulSoup(html, features="html.parser")
    regions = soup.find_all("a", href=re.compile(r"^/index.php/region/index/id/"))
    # NOTE(review): the next line deliberately overwrites the full region list,
    # restricting the run to Antofagasta (id II) — remove to scrape all regions.
    regions = soup.find_all("a", {"href": "/index.php/region/index/id/II"})  # Antofagasta

    for region in regions:
        print("Region is "+region.getText())
        driver.get("https://sinca.mma.gob.cl/"+region.get("href"))
        time.sleep(3)

        # navigate to station and component
        html_region = driver.page_source
        soup_region = BeautifulSoup(html_region, features="html.parser")
        a_titles = [a.get("title") for a in soup_region.find_all("a", title=True)]  # all links

        # from all the links, choose only the components
        stations_components = []
        for a_title in a_titles:
            for variable_text in variables_text:
                if variable_text in a_title:
                    stations_components.append(soup_region.find("a", {"title": a_title}))

        # loop through all stations and components of the region
        for station_component in stations_components:
            print(station_component.get("title"))
            station = station_component.get("title").split("| ", 1)[1]
            component = [x for x in variables_text if x in station_component.get("title")][0]  # get component name on website
            component_choose_time_res = variables_website[variables_text.index(component)]  # get component name for choosing time resolution
            component_ghost = variables_ghost[variables_text.index(component)]  # get component name accordingly in ghost

            # create storage directory; skip stations unknown to the 1.5 metadata
            try:
                station_id = metadata15["station_reference"][metadata15["station_name"] == station].iloc[0]
            except (IndexError, KeyError):
                # bug fix: was a bare `except:` that hid every failure, not just a missing station
                print("{} added since 1.5, no station_id found in metadata_1.5, did not download data from new station".format(station))
                continue
            os.makedirs('/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/{}/'.format(version, station_id), exist_ok=True)
            storage_location = '/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/{}/'.format(version, station_id)

            # go to data on website (hrefs are protocol-relative)
            driver.get('https:'+station_component.get("href"))
            time.sleep(5)
            driver.switch_to.frame("left")

            # select time resolution from the 'ic' dropdown
            dropdown_element = driver.find_element(By.ID, 'ic')
            select = Select(dropdown_element)
            # bug fix: this local was named `options`, shadowing the Chrome Options object above
            option_texts = [opt.get_attribute("text") for opt in select.options]

            i = 0
            for time_resolution in time_resolutions_website:
                # select time resolution only if existent for this component!
                if (component_choose_time_res+' - '+time_resolution) in option_texts:
                    select.select_by_visible_text(component_choose_time_res+' - '+time_resolution)
                    time.sleep(5)
                    if mode == "all":
                        # take the full period the site pre-fills into the form
                        start_date = driver.find_element(By.ID, "from").get_attribute("value")
                        end_date = driver.find_element(By.ID, "to").get_attribute("value")
                    if mode == "nrt":  # updating dates difficult
                        from_field = driver.find_element(By.ID, "from")
                        # bug fix: quote the injected value so JS receives a string, not a number
                        driver.execute_script("arguments[0].value = '{}';".format(bdate), from_field)
                        to_field = driver.find_element(By.ID, "to")
                        driver.execute_script("arguments[0].value = '{}';".format(edate), to_field)
                        # bug fix: the original left start_date/end_date bound to the
                        # WebElements, so the expected filename below contained element
                        # reprs and the wait loop never terminated. The site names the
                        # export after the form values — TODO confirm exact format.
                        start_date = bdate
                        end_date = edate
                    time.sleep(10)
                    driver.switch_to.default_content()
                    driver.switch_to.frame("right")
                    time.sleep(10)
                    WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.LINK_TEXT, "Excel CSV")))  # wait till loaded
                    driver.find_element(By.LINK_TEXT, "Excel CSV").click()

                    # wait until download finished, then move file into storage
                    downloaded_file = "{}/datos_{}_{}.csv".format(download_location, start_date, end_date)
                    while not os.path.exists(downloaded_file):
                        time.sleep(1)
                    if os.path.isfile(downloaded_file):
                        print('{} {} download successful'.format(station_component.get("title"), time_resolution_ghost[i]))
                        shutil.copyfile(downloaded_file, storage_location+component_ghost+'_'+time_resolution_ghost[i]+'.csv')
                        os.remove(downloaded_file)
                    driver.switch_to.default_content()
                    driver.switch_to.frame("left")
                i = i+1

    driver.close()
def scraper_metadata(mode, version):
    """Scrape station metadata from SINCA and compare it against GHOST 1.5.

    For each station page, reads region / province / commune / UTM coordinates /
    timezone from the info table and prints, column by column, whether the
    scraped values match the stored metadata file.

    Parameters
    ----------
    mode : str
        Unused here; kept for interface symmetry with `scraper`.
    version : str
        Unused here; kept for interface symmetry with `scraper`.

    Returns
    -------
    None. Output is printed; side effects are the browser session only.
    """
    baseurl = 'https://sinca.mma.gob.cl/index.php/'
    metadata15 = pd.read_csv("/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/temp/CHILE_SINCA_META_temp.txt")
    print(metadata15)

    # set up driver (no download dir needed — metadata only)
    options = Options()
    options.add_argument("--no-sandbox")
    svc = webdriver.ChromeService(executable_path=binary_path)
    driver = webdriver.Chrome(service=svc, options=options)

    # open url and wait until the region links are present
    driver.get(baseurl)
    WebDriverWait(driver, 60).until(EC.presence_of_all_elements_located((By.XPATH, '//a[contains(@href, "/index.php/region/index/id/")]')))

    # navigate to regions
    html = driver.page_source
    soup = BeautifulSoup(html, features="html.parser")
    regions = soup.find_all("a", href=re.compile(r"^/index.php/region/index/id/"))
    # NOTE(review): the next line deliberately overwrites the full region list,
    # restricting the run to Antofagasta (id II) — remove to scrape all regions.
    regions = soup.find_all("a", {"href": "/index.php/region/index/id/II"})  # Antofagasta

    for region in regions:
        print("Region is "+region.getText())
        driver.get("https://sinca.mma.gob.cl/"+region.get("href"))
        time.sleep(3)
        html = driver.page_source
        soup = BeautifulSoup(html, features="html.parser")
        stations = soup.find_all("a", href=re.compile(r"^/index.php/estacion/index/id/"))

        for station in stations:
            station_name = station.getText()
            print(station_name)
            # bug fix: `station_reference` was used below but never assigned (NameError).
            # Derive it from the station URL id — TODO confirm this matches the
            # reference scheme used in the GHOST metadata file.
            station_reference = station.get("href").rstrip("/").rsplit("/", 1)[-1]
            driver.get("https://sinca.mma.gob.cl/"+station.get("href"))
            time.sleep(3)

            # get meta info from the station's attribute table
            html = driver.page_source
            soup = BeautifulSoup(html, features="html.parser")
            # renamed from `region` to avoid shadowing the outer loop variable
            region_name = soup.find("th", text="Región").find_next_sibling().getText()
            province = soup.find("th", text="Provincia").find_next_sibling().getText()
            commune = soup.find("th", text="Comuna").find_next_sibling().getText()
            UTM_coordinates = soup.find("th", text="Coordenadas UTM").find_next_sibling().getText()
            timezone = soup.find("th", text="Huso horario").find_next_sibling().getText()
            # order must match the metadata file's column order for the compare below
            scraped_metadata = [station_reference, station_name, region_name, province, commune, UTM_coordinates, timezone]

            metadata15_per_station = metadata15.loc[metadata15["station_name"] == station_name]
            # robustness fix: stations missing from the metadata file used to crash on .iloc[0]
            if metadata15_per_station.empty:
                print("{} not found in stored metadata, skipping comparison".format(station_name))
                continue
            print(region_name)
            print(metadata15_per_station)
            print(metadata15_per_station["region"].iloc[0])

            # compare each stored column against the scraped value, in order
            i = 0
            for column in metadata15_per_station.columns:
                print(column)
                if metadata15_per_station[column].iloc[0] == scraped_metadata[i]:
                    print("ok!")
                else:
                    print("not ok")
                i = i+1

    driver.close()