from datetime import date
from datetime import timedelta
import csv
import json
import os
import os.path
import re
import shutil
import ssl
import time
import urllib
import zipfile

import pandas as pd
import requests
from bs4 import BeautifulSoup
from chromedriver_py import binary_path
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait

def download_data(mode, version, n_max_tries, max_time_per_dl):
    """Download CHILE_SINCA observational data via Selenium.

    Navigates https://sinca.mma.gob.cl, loops over regions, stations and
    components, selects every available time resolution and downloads the
    "Excel CSV" export into per-station GHOST directories.

    Parameters
    ----------
    mode : str
        'all' for the full historical record, 'nrt' for the current year only.
        Any other value aborts with a message.
    version : str
        GHOST version string used in the storage paths (overridden by the mode
        name in 'nrt' mode).
    n_max_tries : int
        Maximum number of full scrape retries on WebDriver failures.
    max_time_per_dl : int
        Timeout in seconds for page loads and per-file downloads.
    """

    # component names as they appear in the station page's dropdown
    variables_website = ['MP 10','MP 2,5', 'MP 10discreto', 'SO2', 'NO2', 'NOX', 'NO', 'CO', 'O3', 'Pb', 'As', 'Cu']
    # component names as they appear in the link titles on the region pages
    variables_text = ["Material particulado MP 10", "Material particulado MP 2,5", "Material particulado 10 micrometros discreto", 'Dióxido de azufre', 'Dióxido de nitrógeno', 'Óxidos de nitrógeno', 'Monóxido de nitrógeno', 'Monóxido de carbono', 'Ozono', 'Plomo', 'Arsénico', 'Cobre']
    time_resolutions_website = ["registro diario", "registro horario"]

    # GHOST-standard names; must stay in the same order as variables_website/variables_text!
    variables_ghost = ['PM10', 'PM2.5', 'PM10disc', 'SO2', 'NO2', 'NOX', 'NO','CO', 'O3','Pb', 'As', 'Cu']
    time_resolution_ghost = ['daily', 'hourly']

    metadata15 = pd.read_csv("/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/CHILE_SINCA_META.txt")

    baseurl = 'https://sinca.mma.gob.cl/index.php/'

    if mode == 'all':
        # create download directory
        download_location = '/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version)
        os.makedirs(download_location, exist_ok=True)

    elif mode == 'nrt':
        bdate = date(date.today().year, 1, 1).strftime('%Y%m%d')[2:]  # e.g. "240101"
        edate = date.today().strftime('%Y%m%d')[2:]
        print(edate)
        # create download directory
        version = mode
        download_location = '/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version)
        os.makedirs(download_location, exist_ok=True)

    else:
        print('time mode inapplicable')
        # bugfix: previously fell through and crashed on undefined download_location
        return

    n_tries = 0
    errcode = 999

    while (n_tries < n_max_tries) and (errcode != 200):
        try:
            # set up driver
            options = Options()
            prefs = {'download.default_directory': download_location}
            options.add_experimental_option('prefs', prefs)
            options.add_argument("--no-sandbox")
            options.add_argument("disable-infobars")
            options.add_argument("--disable-extensions")
            options.add_argument("--disable-gpu")
            options.add_argument("--disable-dev-shm-usage")
            if n_tries > 0:
                # retry headless after the first failed attempt
                options.add_argument("--headless")

            svc = webdriver.ChromeService(executable_path=binary_path)
            driver = webdriver.Chrome(service=svc, options=options)

            driver.get(baseurl)
            # wait until the region links are loaded
            WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.XPATH, '//a[contains(@href, "/index.php/region/index/id/")]')))

            # navigate to regions
            html = driver.page_source
            soup = BeautifulSoup(html, features="html.parser")
            regions = soup.find_all("a", href=re.compile(r"^/index.php/region/index/id/"))
            # NOTE(review): the next line narrows the scrape to Antofagasta only,
            # discarding the full region list built above — confirm intentional
            regions = soup.find_all("a", {"href": "/index.php/region/index/id/II"})  # Antofagasta

            for region in regions:

                print("Region is " + region.getText())

                driver.get("https://sinca.mma.gob.cl/" + region.get("href"))
                time.sleep(3)

                # navigate to station and component
                html_region = driver.page_source
                soup_region = BeautifulSoup(html_region, features="html.parser")
                a_titles = [a.get("title") for a in soup_region.find_all("a", title=True)]  # all links
                # from all the links, choose only the components
                stations_components = []
                for a_title in a_titles:
                    for variable_text in variables_text:
                        if variable_text in a_title:
                            stations_components.append(soup_region.find("a", {"title": a_title}))

                # loop through all stations and components of the region
                for station_component in stations_components:

                    print(station_component.get("title"))
                    station = station_component.get("title").split("| ", 1)[1]
                    component = [x for x in variables_text if x in station_component.get("title")][0]  # component name on website
                    component_choose_time_res = variables_website[variables_text.index(component)]  # name used in the time-resolution dropdown
                    component_ghost = variables_ghost[variables_text.index(component)]  # component name in GHOST

                    # create storage directory; skip stations missing from the 1.5 metadata
                    try:
                        station_id = metadata15["station_reference"][metadata15["station_name"] == station].iloc[0]
                    except IndexError:  # bugfix: was a bare except hiding unrelated errors
                        print("{} added since 1.5, no station_id found in metadata_1.5, did not download data from new station".format(station))
                        continue
                    storage_location = '/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/{}/'.format(version, station_id)
                    os.makedirs(storage_location, exist_ok=True)

                    # go to data on website
                    driver.get('https:' + station_component.get("href"))
                    time.sleep(5)

                    driver.switch_to.frame("left")

                    # read the available time resolutions from the dropdown
                    dropdown_element = driver.find_element(By.ID, 'ic')
                    select = Select(dropdown_element)
                    # renamed from `options` to avoid shadowing the Chrome Options object
                    dropdown_texts = [opt.get_attribute("text") for opt in select.options]

                    # bugfix: use enumerate so i always matches time_resolution; the old
                    # manual counter only advanced for available resolutions, mislabelling
                    # hourly data as daily when the daily resolution was missing
                    for i, time_resolution in enumerate(time_resolutions_website):

                        # select time resolution only if existent
                        if (component_choose_time_res + ' - ' + time_resolution) not in dropdown_texts:
                            continue

                        select.select_by_visible_text(component_choose_time_res + ' - ' + time_resolution)
                        time.sleep(5)

                        if mode == "all":
                            start_date = driver.find_element(By.ID, "from").get_attribute("value")
                            end_date = driver.find_element(By.ID, "to").get_attribute("value")
                        if mode == "nrt":  # updating dates difficult
                            from_element = driver.find_element(By.ID, "from")
                            driver.execute_script("arguments[0].value = {};".format(bdate), from_element)

                            to_element = driver.find_element(By.ID, "to")
                            driver.execute_script("arguments[0].value = {};".format(edate), to_element)

                            # bugfix: start_date/end_date were left as WebElements, so the
                            # expected filename below never matched and the wait loop hung;
                            # the download is named after the injected dates
                            start_date, end_date = bdate, edate

                            time.sleep(10)

                        driver.switch_to.default_content()
                        driver.switch_to.frame("right")

                        time.sleep(10)
                        WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.LINK_TEXT, "Excel CSV")))  # wait till loaded

                        driver.find_element(By.LINK_TEXT, "Excel CSV").click()

                        # wait until download finished, bounded by max_time_per_dl
                        # (bugfix: the old loop had no timeout and skipped the copy when
                        # the file appeared before the first existence check, while still
                        # deleting it afterwards)
                        csv_file = "{}/datos_{}_{}.csv".format(download_location, start_date, end_date)
                        waited = 0
                        while (not os.path.isfile(csv_file)) and (waited < max_time_per_dl):
                            time.sleep(1)
                            waited += 1

                        if os.path.isfile(csv_file):
                            print('{} {} download successful'.format(station_component.get("title"), time_resolution_ghost[i]))
                            shutil.copyfile(csv_file, storage_location + component_ghost + '_' + time_resolution_ghost[i] + '.csv')
                            os.remove(csv_file)
                        else:
                            print('{} {} download failed'.format(station_component.get("title"), time_resolution_ghost[i]))

                        driver.switch_to.default_content()
                        driver.switch_to.frame("left")
            driver.close()
            errcode = 200

        except WebDriverException as e:
            print(e)
            n_tries = n_tries + 1
            print("Number of tries: {}".format(n_tries))
            continue

    if n_tries == n_max_tries:
        print('Failed downloading CHILE_SINCA data {} times in {} seconds'.format(n_tries, max_time_per_dl))

def download_metadata(n_max_tries, max_time_per_dl):
    """Scrape CHILE_SINCA station metadata and rebuild the in-memory metadata dict.

    Navigates https://sinca.mma.gob.cl, loops over regions and stations, scrapes
    each station's metadata table (region, province, commune, UTM coordinates,
    timezone) and its per-component instruments, and fills ``metadata_new`` with
    entries matching station references found in the processed GHOST metadata
    JSON. The result is only printed; nothing is written back to disk here.

    Parameters
    ----------
    n_max_tries : int
        Maximum number of full scrape retries on WebDriver failures.
    max_time_per_dl : int
        Timeout in seconds for page loads.
    """

    # component names as they appear in link titles on the station pages
    variables_text = ["Material particulado MP 10", "Material particulado MP 2,5", "Material particulado 10 micrometros discreto", 'Dióxido de azufre', 'Dióxido de nitrógeno', 'Óxidos de nitrógeno', 'Monóxido de nitrógeno', 'Monóxido de carbono', 'Ozono', 'Plomo', 'Arsénico', 'Cobre']
    # GHOST-standard names; must stay in the same order as variables_text!
    variables_ghost = ['PM10', 'PM2.5', 'PM10disc', 'SO2', 'NO2', 'NOX', 'NO','CO', 'O3','Pb', 'As', 'Cu']

    baseurl = 'https://sinca.mma.gob.cl/index.php/'
    today = date.today()

    # kept for its fail-fast side effect (raises if the network-provided file is
    # missing); currently unused by the active code path below
    metadata15 = pd.read_csv("/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/temp/CHILE_SINCA_META_temp.txt")

    with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/processed/CHILE_SINCA_META.json', 'r', encoding='utf-8') as f:
        metadata_old = json.loads(f.read())

    n_tries = 0
    errcode = 999
    metadata_new = {}
    while (n_tries < n_max_tries) and (errcode != 200):
        try:
            # set up headless driver (no download dir needed — metadata only)
            options = Options()
            options.add_argument("--no-sandbox")
            options.add_argument("--headless")
            svc = webdriver.ChromeService(executable_path=binary_path)
            driver = webdriver.Chrome(service=svc, options=options)

            # open url and wait until the region links are loaded
            driver.get(baseurl)
            WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.XPATH, '//a[contains(@href, "/index.php/region/index/id/")]')))

            # navigate to regions
            html = driver.page_source
            soup = BeautifulSoup(html, features="html.parser")

            regions = soup.find_all("a", href=re.compile(r"^/index.php/region/index/id/"))
            # NOTE(review): the next line narrows the scrape to Antofagasta only,
            # discarding the full region list built above — confirm intentional
            regions = soup.find_all("a", {"href": "/index.php/region/index/id/II"})  # Antofagasta

            for region in regions:

                print("Region is " + region.getText())

                driver.get("https://sinca.mma.gob.cl/" + region.get("href"))
                time.sleep(3)

                html = driver.page_source
                soup = BeautifulSoup(html, features="html.parser")
                stations = soup.find_all("a", href=re.compile(r"^/index.php/estacion/index/id/"))

                for station in stations:

                    station_name_new = station.getText()
                    print(station_name_new)

                    driver.get("https://sinca.mma.gob.cl/" + station.get("href"))
                    time.sleep(3)

                    # get meta info from the station page
                    html = driver.page_source
                    soup = BeautifulSoup(html, features="html.parser")

                    # renamed from `region` to stop clobbering the loop variable
                    region_name = soup.find("th", text="Región").find_next_sibling().getText()
                    province = soup.find("th", text="Provincia").find_next_sibling().getText()
                    commune = soup.find("th", text="Comuna").find_next_sibling().getText()
                    UTM_coordinates = soup.find("th", text="Coordenadas UTM").find_next_sibling().getText().replace(' ', '')
                    # UTM string looks like "<easting>E<northing>"; split at the 'E'
                    lon = UTM_coordinates.split('E')[0] + 'E'
                    lat = UTM_coordinates.split('E')[1]
                    timezone = soup.find("th", text="Huso horario").find_next_sibling().getText()

                    # collect the measuring instrument per component
                    ins_table = soup.find('table', id="medicion")
                    instruments = ins_table.find_all("td", {"class": "helpTecnica center"})
                    instruments_per_component = {}
                    for instrument in instruments:
                        component = instrument.find_parent().find('a').getText()
                        # bugfix: test the 'MP 1,5' special case first — it is 6 chars
                        # long, so the len > 5 branch used to shadow it and then crash
                        # in variables_text.index()
                        if 'MP 1,5' in component:
                            component = 'MP2.5'
                        elif len(component) > 5:
                            # long website names get mapped to GHOST short names;
                            # short names are already fine and kept as-is
                            component = variables_ghost[variables_text.index(component)]

                        if "No informado" in instrument.getText():
                            instruments_per_component[component] = None
                        else:
                            instrument_name = re.sub(' +', ' ', instrument.getText())
                            instrument_name = instrument_name.split("\n")[-1]
                            instruments_per_component[component] = instrument_name

                    print(instruments_per_component)

                    for station_reference in metadata_old:
                        # match station with previously referenced station reference from old file
                        if metadata_old[station_reference]['station_name']['values'][0] == station_name_new:
                            # order must match the plain-parameter order of the old
                            # metadata file — TODO confirm against the JSON schema
                            scraped_metadata = [station_reference, station_name_new, region_name, province, commune, lon, lat, timezone]
                            i = 0
                            for parameter in metadata_old[station_reference]:
                                # bugfix: the old condition
                                #   if "instrument" and "comments" not in parameter:
                                # only tested "comments" ("instrument" is a truthy constant)
                                if ("instrument" not in parameter) and ("comments" not in parameter):
                                    if i < len(scraped_metadata):
                                        # bugfix: create the station entry before filling it
                                        # (metadata_new[station_reference] raised KeyError),
                                        # and actually advance the counter (was i=+1, i.e. 1)
                                        metadata_new.setdefault(station_reference, {})
                                        metadata_new[station_reference][parameter] = {"values": [scraped_metadata[i]], "update_time": [today.strftime('%Y-%m-%d')]}
                                        i += 1

                    print(metadata_new)
            driver.close()
            errcode = 200

        except WebDriverException as e:
            print(e)
            n_tries = n_tries + 1
            print("Number of tries: {}".format(n_tries))
            continue

    if n_tries == n_max_tries:
        print('Failed downloading CHILE_SINCA metadata {} times in {} seconds'.format(n_tries, max_time_per_dl))

    print(metadata_new)
