# NOTE(review): the original extraction of this file began with the literal
# lines "Newer" / "Older" — navigation buttons captured from a web diff
# viewer, not Python code. Replaced with this comment.
from selenium import webdriver
from chromedriver_py import binary_path
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup
import requests
import time
from datetime import date
from datetime import timedelta
import pandas as pd
import os.path
import urllib
import time
import ssl
import zipfile
import shutil
import os
import re
def download_data(mode, version, n_max_tries, max_time_per_dl):
    """Scrape measurement CSVs from the CHILE SINCA website into the GHOST archive.

    Drives a Chrome browser through https://sinca.mma.gob.cl, walking
    region pages -> station/component links, selecting each available time
    resolution in the station dropdown, clicking the "Excel CSV" export and
    copying the downloaded file into the per-station storage directory.

    Parameters:
        mode: 'all' for a full historical download, 'nrt' to restrict the
            date range to the current year (any other value only prints a
            message — presumably the function then fails later; TODO confirm).
        version: GHOST version tag used in the download/storage paths;
            overwritten with 'nrt' when mode == 'nrt'.
        n_max_tries: maximum number of retry attempts after WebDriver errors.
        max_time_per_dl: timeout in seconds handed to WebDriverWait.

    NOTE(review): this source was recovered from a web-diff view — the
    original indentation was lost and some line ranges are missing (they
    appear below as bare integer statements, e.g. `45`). The indentation in
    this block is a best-effort reconstruction; confirm against the
    repository of record before running.
    """
    # Component labels as they appear in the station's time-resolution dropdown.
    variables_website = ['MP 10','MP 2,5', 'MP 10discreto', 'SO2', 'NO2', 'NOX', 'NO', 'CO', 'O3', 'Pb', 'As', 'Cu']
    # Component names as they appear in link titles on the region pages (Spanish).
    variables_text = ["Material particulado MP 10", "Material particulado MP 2,5", "Material particulado 10 micrometros discreto", 'Dióxido de azufre', 'Dióxido de nitrógeno', 'Óxidos de nitrógeno', 'Monóxido de nitrógeno', 'Monóxido de carbono', 'Ozono', 'Plomo', 'Arsénico', 'Cobre']
    # Time-resolution suffixes shown in the dropdown ("daily record", "hourly record").
    time_resolutions_website = ["registro diario", "registro horario"]
    variables_ghost = ['PM10', 'PM2.5', 'PM10disc', 'SO2', 'NO2', 'NOX', 'NO','CO', 'O3','Pb', 'As', 'Cu'] # needs to be same order as variables_website!
    time_resolution_ghost = ['daily', 'hourly']
    # GHOST v1.5 station metadata; used to map website station names to station IDs.
    metadata15 = pd.read_csv("/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/CHILE_SINCA_META.txt")
    baseurl = 'https://sinca.mma.gob.cl/index.php/'
    # NOTE(review): the bare integers below are line-number artifacts from the
    # diff viewer this file was extracted from; original source lines 45-89
    # were collapsed there and are MISSING here.
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    if mode == 'all':
        # create download directory
        os.makedirs('/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version), exist_ok=True)
        download_location = '/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version)
    elif mode == 'nrt':
        # NRT window: Jan 1st of the current year to today, as YYMMDD strings.
        bdate = date(date.today().year, 1, 1).strftime('%Y%m%d')[2:] #"240101"
        edate = date.today().strftime('%Y%m%d')[2:]
        print(edate)
        # create download directory; version tag is replaced by the mode name
        version = mode
        os.makedirs('/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version), exist_ok=True)
        download_location = '/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version)
    else:
        # NOTE(review): download_location stays undefined on this path; the
        # driver setup below would raise NameError — presumably intentional
        # fail-fast, but confirm.
        print('time mode inapplicable')
    # Retry loop: errcode is set to 200 only when a full pass succeeds.
    n_tries = 0
    errcode = 999
    while (n_tries < n_max_tries) and (errcode != 200):
        try:
            # set up driver: downloads go straight to download_location
            options = Options()
            prefs = {'download.default_directory' : download_location}
            options.add_experimental_option('prefs', prefs)
            options.add_argument("--no-sandbox")
            options.add_argument("disable-infobars")
            options.add_argument("--disable-extensions")
            options.add_argument("--disable-gpu")
            options.add_argument("--disable-dev-shm-usage")
            # first attempt runs headed; retries run headless
            if n_tries > 0:
                options.add_argument("--headless")
            svc = webdriver.ChromeService(executable_path=binary_path)
            driver = webdriver.Chrome(service=svc, options=options)
            driver.get(baseurl)
            WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.XPATH, '//a[contains(@href, "/index.php/region/index/id/")]'))) # wait till loaded
            # navigate to regions
            html = driver.page_source
            soup = BeautifulSoup(html, features="html.parser")
            regions = soup.find_all("a", href=re.compile(r"^/index.php/region/index/id/"))
            # NOTE(review): the next line overwrites the full region list and
            # restricts the crawl to a single region (Antofagasta) — looks
            # like leftover debugging; confirm before a production run.
            regions = soup.find_all("a", {"href": "/index.php/region/index/id/II"}) # Antofagasta
            # NOTE(review): bare integers below are diff-viewer artifacts;
            # original source lines 92-129 are MISSING here.
            92
            93
            94
            95
            96
            97
            98
            99
            100
            101
            102
            103
            104
            105
            106
            107
            108
            109
            110
            111
            112
            113
            114
            115
            116
            117
            118
            119
            120
            121
            122
            123
            124
            125
            126
            127
            128
            129
            for region in regions:
                print("Region is "+region.getText())
                driver.get("https://sinca.mma.gob.cl/"+region.get("href"))
                time.sleep(3)
                # navigate to station and component
                html_region = driver.page_source
                soup_region = BeautifulSoup(html_region, features="html.parser")
                a_titles = [a.get("title") for a in soup_region.find_all("a", title=True)] # all links
                # from all the links, choose only the components
                stations_components = []
                for a_title in a_titles:
                    for variable_text in variables_text:
                        if variable_text in a_title:
                            stations_components.append(soup_region.find("a", {"title": a_title}))
                # loop through all stations and components of the region
                for station_component in stations_components:
                    print(station_component.get("title"))
                    # link title format is "<component> | <station name>"
                    station = station_component.get("title").split("| ", 1)[1]
                    component = [x for x in variables_text if x in station_component.get("title")][0] # get component name on website
                    component_choose_time_res = variables_website[variables_text.index(component)] # get component name for choosing time resolution
                    component_ghost = variables_ghost[variables_text.index(component)] # get component name accordingly in ghost
                    # create storage directory; stations unknown to the v1.5
                    # metadata have no station_id and are skipped
                    try:
                        station_id = metadata15["station_reference"][metadata15["station_name"] == station].iloc[0]
                        os.makedirs('/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/{}/'.format(version, station_id), exist_ok=True)
                        storage_location = '/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/{}/'.format(version, station_id)
                    except:
                        print("{} added since 1.5, no station_id found in metadata_1.5, did not download data from new station".format(station))
                        continue
                    # go to data on website (href is protocol-relative)
                    driver.get('https:'+station_component.get("href"))
                    # select time resolution from the 'ic' dropdown
                    dropdown_element = driver.find_element(By.ID, 'ic')
                    select = Select(dropdown_element)
                    # NOTE(review): rebinds `options` (was the Chrome Options
                    # object) to the dropdown's option texts.
                    options = [opt.get_attribute("text") for opt in select.options]
                    # i indexes time_resolution_ghost in step with this loop
                    i=0
                    for time_resolution in time_resolutions_website:
                        #select time resolution if existent!
                        if (component_choose_time_res+' - '+time_resolution) in options:
                            select.select_by_visible_text(component_choose_time_res+' - '+time_resolution)
                            #print("Time resolution is: {}".format(time_resolution_ghost[i]))
                            time.sleep(5)
                            if mode == "all":
                                # keep the site's own full date range; the values
                                # also name the downloaded file (datos_<from>_<to>.csv)
                                start_date = driver.find_element(By.ID, "from").get_attribute("value")
                                end_date = driver.find_element(By.ID, "to").get_attribute("value")
                            if mode == "nrt": # updating dates difficult
                                # inject the NRT window directly via JS
                                start_date = driver.find_element(By.ID, "from")
                                driver.execute_script("arguments[0].value = {};".format(bdate), start_date)
                                end_date = driver.find_element(By.ID, "to")
                                driver.execute_script("arguments[0].value = {};".format(edate), end_date)
                                # NOTE(review): in nrt mode start_date/end_date stay
                                # WebElements, so the filename checks below format
                                # element reprs, not dates — TODO confirm intended.
                            driver.switch_to.default_content()
                            driver.switch_to.frame("right")
                            time.sleep(10)
                            WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.LINK_TEXT, "Excel CSV"))) # wait till loaded
                            driver.find_element(By.LINK_TEXT, "Excel CSV").click()
                            # wait until download finished (no timeout here — can spin forever)
                            while not os.path.exists("{}/datos_{}_{}.csv".format(download_location, start_date, end_date)):
                                time.sleep(1)
                            if os.path.isfile("{}/datos_{}_{}.csv".format(download_location, start_date, end_date)):
                                print('{} {} download successful'.format(station_component.get("title"), time_resolution_ghost[i]))
                                # store as e.g. <storage_location>/PM10_hourly.csv
                                shutil.copyfile("{}/datos_{}_{}.csv".format(download_location, start_date, end_date), storage_location+component_ghost+'_'+time_resolution_ghost[i]+'.csv')
                            os.remove("{}/datos_{}_{}.csv".format(download_location, start_date, end_date))
                            # back to the navigation frame for the next component
                            driver.switch_to.default_content()
                            driver.switch_to.frame("left")
                        i=i+1
            driver.close()
            errcode = 200
        except WebDriverException as e:
            # any browser/driver failure: log it and retry (headless from now on)
            print(e)
            n_tries = n_tries+1
            print("Number of tries: {}".format(n_tries))
            continue
    if n_tries == n_max_tries:
        print('Failed downloading CHILE_SINCA data {} times in {} seconds'.format(n_tries, max_time_per_dl))
def download_metadata(n_max_tries, max_time_per_dl):
    """Scrape per-station metadata (region, province, commune, UTM coords,
    timezone, instruments) from the CHILE SINCA website and write it as a
    dated JSON file under metadata/network_provided/.

    Parameters:
        n_max_tries: maximum number of retry attempts after WebDriver errors.
        max_time_per_dl: timeout in seconds handed to WebDriverWait.

    NOTE(review): this block was recovered from a web-diff view with
    indentation stripped and several lines lost outright (loop headers,
    a `try:`, an `if` body, an `ins_table` assignment). The tokens below are
    preserved verbatim and the indentation is a best-effort reconstruction;
    this block is NOT runnable as-is — each broken spot is flagged inline.
    """
    # paths and variables (same component tables as download_data)
    variables_website = ['MP 10','MP 2,5', 'MP 10discreto', 'SO2', 'NO2', 'NOX', 'NO', 'CO', 'O3', 'Pb', 'As', 'Cu']
    variables_text = ["Material particulado MP 10", "Material particulado MP 2,5", "Material particulado 10 micrometros discreto", 'Dióxido de azufre', 'Dióxido de nitrógeno', 'Óxidos de nitrógeno', 'Monóxido de nitrógeno', 'Monóxido de carbono', 'Ozono', 'Plomo', 'Arsénico', 'Cobre']
    time_resolutions_website = ["registro diario", "registro horario"]
    variables_ghost = ['PM10', 'PM2.5', 'PM10disc', 'SO2', 'NO2', 'NOX', 'NO','CO', 'O3','Pb', 'As', 'Cu'] # needs to be same order as variables_website!
    time_resolution_ghost = ['daily', 'hourly']
    baseurl = 'https://sinca.mma.gob.cl/index.php/'
    # today's date stamps every updated value and names the output file
    today = date.today()
    metadata15 = pd.read_csv("/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/temp/CHILE_SINCA_META_temp.txt")
    #print(metadata15)
    instruments_old = ['O3_instrument','NO_instrument','NO2_instrument','CO_instrument','CH4_instrument','SO2_instrument','NMHC_instrument','HC_instrument','PM10_instrument','PM2.5_instrument','As_instrument','Cu_instrument','Pb_instrument']
    # previously processed metadata: station_reference -> {parameter -> {values, update_time}}
    with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/processed/CHILE_SINCA_META.json', 'r', encoding='utf-8') as f:
        metadata_old = json.loads(f.read())
    # retry loop, same pattern as download_data
    n_tries = 0
    errcode = 999
    metadata_new = {}
    while (n_tries < n_max_tries) and (errcode != 200):
        try:
            # set up driver (always headless; no download dir needed here)
            options = Options()
            #prefs = {'download.default_directory' : download_location}
            #options.add_experimental_option('prefs', prefs)
            options.add_argument("--no-sandbox")
            options.add_argument("--headless")
            svc = webdriver.ChromeService(executable_path=binary_path)
            driver = webdriver.Chrome(service=svc, options=options)
            # open url
            driver.get(baseurl)
            WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.XPATH, '//a[contains(@href, "/index.php/region/index/id/")]'))) # wait till loaded
            # navigate to regions
            html = driver.page_source
            soup = BeautifulSoup(html, features="html.parser")
            regions = soup.find_all("a", href=re.compile(r"^/index.php/region/index/id/"))
            #regions = soup.find_all("a", {"href": "/index.php/region/index/id/II"}) # Antofagasta
            # NOTE(review): `region` is undefined here — a `for region in
            # regions:` loop header was lost in extraction; the rest of this
            # try-body almost certainly belongs inside that loop.
            driver.get("https://sinca.mma.gob.cl/"+region.get("href"))
            html = driver.page_source
            soup = BeautifulSoup(html, features="html.parser")
            stations = soup.find_all("a", href=re.compile(r"^/index.php/estacion/index/id/"))
            # NOTE(review): `station` is undefined — a `for station in
            # stations:` loop header was lost here as well.
            station_name_new = station.getText()
            print(station_name_new)
            driver.get("https://sinca.mma.gob.cl/"+station.get("href"))
            time.sleep(3)
            # get meta info from the station page's info table
            html = driver.page_source
            soup = BeautifulSoup(html, features="html.parser")
            region = soup.find("th", text="Región").find_next_sibling().getText()
            province = soup.find("th", text="Provincia").find_next_sibling().getText()
            commune = soup.find("th", text="Comuna").find_next_sibling().getText()
            UTM_coordinates = soup.find("th", text="Coordenadas UTM").find_next_sibling().getText().replace(' ', '')
            # coordinates come as one string "<easting>E<northing>N..."; split on 'E'
            lon = UTM_coordinates.split('E')[0]+'E'
            lat = UTM_coordinates.split('E')[1].split("\n")[0]
            timezone = soup.find("th", text="Huso horario").find_next_sibling().getText().replace(' ', '')
            # NOTE(review): `ins_table` is never assigned in the visible code —
            # the line extracting the instruments table was lost.
            if ins_table is not None: # check if there are instruments for air pollution at this station
                instruments = ins_table.find_all("td", {"class": "helpTecnica center"})
                instruments_per_component = {}
            else:
                continue
            for instrument in instruments:
                # map the instrument row back to its component's GHOST name
                component = instrument.find_parent().find('a').getText()
                component = variables_ghost[variables_text.index(component)]
            # NOTE(review): orphan `except:` — its matching `try:` (around the
            # two lookups above, falling back from variables_text to
            # variables_website) was lost in extraction.
            except:
                try:
                    component = variables_ghost[variables_website.index(component)]
                except:
                    pass
                # special-case label like 'Ozono.-...' that matches neither table
                if 'Ozono.-' in component:
                    component = 'O3'
                #======
                # NOTE(review): the `if` body below is missing (lost lines) —
                # presumably it skipped/blanked unreported instruments
                # ("No informado" = "not reported").
                if "No informado" in instrument.getText():
                else:
                    # collapse whitespace; instrument name is the last line of the cell
                    instrument_name = re.sub(' +', ' ', instrument.getText())
                    instrument_name = instrument_name.split("\n")[-1]
                    instruments_per_component[component] = instrument_name
            # match the scraped station against the old metadata by station name
            for station_reference in metadata_old:
                if metadata_old[station_reference]['station_name']['values'][0] == station_name_new: # match station with previously referenced station reference from old file
                    i=0
                    metadata_new[station_reference] = {} # create inner dictionary
                    # order must mirror the non-instrument parameter order in metadata_old
                    scraped_metadata = [station_reference, station_name_new, region, province, commune, lon, lat, timezone]
                    for parameter in metadata_old[station_reference]: # loop through the meta parameters
                        if ("instrument" not in parameter) and ("comments" not in parameter): # go through all that are not instruments
                            metadata_new[station_reference][parameter] = {"values": [scraped_metadata[i]], "update_time": [today.strftime('%Y-%m-%d')]}
                            i=i+1
                            # NOTE(review): the next line immediately overwrites the
                            # assignment above with '' — a surrounding try/except
                            # (IndexError once scraped_metadata is exhausted) was
                            # probably lost in extraction.
                            metadata_new[station_reference][parameter] = {"values": [''], "update_time": [today.strftime('%Y-%m-%d')]}
                        else: # go through the instruments
                            for component in instruments_per_component:
                                if component in parameter:
                                    metadata_new[station_reference][parameter] = {"values": [instruments_per_component[component]], "update_time": [today.strftime('%Y-%m-%d')]}
                                    # NOTE(review): same pattern — this overwrite with ''
                                    # likely belonged to a lost else-branch.
                                    metadata_new[station_reference][parameter] = {"values": [''], "update_time": [today.strftime('%Y-%m-%d')]}
            # safe (save the freshly scraped metadata, stamped with today's date)
            with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/CHILE_SINCA_META_{}.json'.format(today.strftime('%Y%m%d')), 'w', encoding='utf-8') as f:
                f.write(json.dumps(metadata_new, indent=4, ensure_ascii=False))
            driver.close()
            errcode = 200
        except WebDriverException as e:
            print(e)
            n_tries = n_tries+1
            print("Number of tries: {}".format(n_tries))
            continue
    if n_tries == n_max_tries:
        # NOTE(review): the trailing triple-quote below was fused onto this line
        # by the extraction (it pairs with the stray `"""` on the next source
        # line, which opened a commented-out code region).
        print('Failed downloading CHILE_SINCA metadata {} times in {} seconds'.format(n_tries, max_time_per_dl))"""
"""
# create json from original metadata file =====================================================================================
json_metadata = {}
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/CHILE_SINCA_META.txt', 'r') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['station_reference']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/processed/CHILE_SINCA_META.json', 'w', encoding='utf-8') as f:
# read newly scraped file
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/CHILE_SINCA_META_{}.json'.format(today.strftime('%Y%m%d')), 'r', encoding='utf-8') as f:
json_metadata_now = json.loads(f.read())
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
# read standardised file to compare!
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/processed/CHILE_SINCA_META.json', 'r', encoding='utf-8') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadeta file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# safe
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/processed/CHILE_SINCA_META.json', 'w', encoding='utf-8') as f: