# EANET_download.py
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from chromedriver_py import binary_path
from selenium.webdriver.chrome.options import Options
import requests
import time
from datetime import date, timedelta
import zipfile

import os.path
import os
import pandas as pd
def download_data(mode, version, n_max_tries, max_time_per_dl):
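    """Log in to the EANET public-data portal with Selenium and download the
    measurement archive, one zip file per country.

    mode            -- only 'all' is supported ('nrt' is unavailable for EANET)
    version         -- subdirectory name under original_files/
    n_max_tries     -- not used in this function (kept for a uniform interface)
    max_time_per_dl -- seconds allowed for page loads and per-country downloads
    """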
    url = 'https://monitoring.eanet.asia/document/menu/index#publicData'
    today = date.today().strftime('%Y%m%d')  # note: the server may stamp the zip with a different date (timezone offset); a timedelta(days=1) correction was considered
    
    if mode == 'all':
        bdate = date(1980, 1, 1)  # a date safely before the record starts
        edate = date.today()

        download_location = "/esarchive/obs/ghost/EANET/original_files/{}/".format(version)
        os.makedirs(download_location, exist_ok=True)

    elif mode == 'nrt':
        print("EANET does not provide near-real-time data")
        exit()

    else:
        print("time mode '{}' not applicable".format(mode))
        exit()
    

    # configure Chrome to download straight into the archive directory
    options = Options()
    prefs = {'download.default_directory' : download_location}
    options.add_experimental_option('prefs', prefs)
    options.add_argument("--no-sandbox")
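    # note: Chrome runs with a visible window here; on a display-less server
    # one would typically also pass options.add_argument("--headless=new")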
    svc = webdriver.ChromeService(executable_path=binary_path)
    driver = webdriver.Chrome(service=svc, options=options)

    driver.get(url)

    # wait until the page has loaded and the login form is present, polling
    # instead of sleeping for the full duration
    WebDriverWait(driver, max_time_per_dl).until(
        EC.presence_of_element_located((By.ID, "email")))

    # login
    email = driver.find_element(By.ID, "email")
    # assumption: credentials are read from environment variables rather than
    # being hardcoded in the source
    email.send_keys(os.environ["EANET_EMAIL"])
    passwd = driver.find_element(By.ID, "passwd")
    passwd.send_keys(os.environ["EANET_PASSWD"])
    time.sleep(max_time_per_dl)
    driver.find_element(By.NAME, "submitBtn").click()

    time.sleep(max_time_per_dl)

    # find countries
    dropdown_element = driver.find_element(By.ID, 'countryCd')
    select = Select(dropdown_element)
    countries = []
    COUNTRIES = []
    for country in select.options:
        if country.text == "-":
            continue  # skip the placeholder entry
        countries.append(str(country.text))

        # map the dropdown labels to the directory names used in the archive
        if country.text == "Korea, Republic of":
            COUNTRIES.append("SOUTH_KOREA")
        elif country.text == "Lao People's Democratic Republic":
            COUNTRIES.append("LAOS")
        elif country.text == "Viet Nam":
            COUNTRIES.append("VIETNAM")
        else:
            COUNTRIES.append(str(country.text).upper())

    print(countries)
    print(COUNTRIES)
 
    # uncomment to test on a subset of countries:
    #countries = ["Cambodia", "China"]
    #COUNTRIES = ["CAMBODIA", "CHINA"]

    # download
    for i, country in enumerate(countries):
        # create download directory
        os.makedirs(download_location+COUNTRIES[i]+'/', exist_ok=True)

        dropdown_element = driver.find_element(By.ID, 'countryCd')
        select = Select(dropdown_element)
        select.select_by_visible_text(country)
        
        print("successfully selected {}".format(country))

        # start download
        driver.find_element(By.CSS_SELECTOR, "input[id='downloadBtn'][type='button']").click()

        # wait until the download has finished, then move the zip into the
        # country directory and unzip it there; max_time_per_dl doubles as the
        # timeout in seconds (assumption)
        base_zip = "{}detail_{}.zip".format(download_location, today)
        country_zip = "{}{}/detail_{}.zip".format(download_location, COUNTRIES[i], today)
        waited = 0
        while not os.path.exists(country_zip):
            time.sleep(1)
            waited += 1
            if waited > max_time_per_dl:
                print('{} download timed out'.format(country))
                break

            if os.path.isfile(base_zip):
                print('{} download successful'.format(country))

                # move to country directory
                os.rename(base_zip, country_zip)

                # unzip
                with zipfile.ZipFile(country_zip, 'r') as zip_ref:
                    zip_ref.extractall(os.path.dirname(country_zip))

        # remove the archive once extracted (it may be missing after a timeout)
        if os.path.isfile(country_zip):
            os.remove(country_zip)


    # quit() closes the browser and ends the chromedriver session
    driver.quit()


def download_metadata(n_max_tries, max_time_per_dl):
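    """Download the EANET site-information workbook with retries and load it
    into a pandas DataFrame."""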

    url_metadata = 'https://www.eanet.asia/wp-content/uploads/2024/01/Site_Information_Acid_Deposition_Monitoring_NMP2023_1117.xlsm'
    download_location = "/esarchive/obs/ghost/EANET/metadata/network_provided/EANET_META_{}.xlsm"  # the source file is an Excel .xlsm workbook, so keep that extension
    headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}

    n_tries = 0
    errcode = 999
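    # 999 is a sentinel; any value other than 200 keeps the retry loop running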
    today = date.today()
    
    while (n_tries < n_max_tries) and (errcode != 200):
        try:
            r = requests.get(url_metadata, timeout=max_time_per_dl, headers=headers)
        except requests.exceptions.RequestException as e:
            print('Request failed ({}), attempt {}'.format(e, n_tries))
            n_tries += 1
            time.sleep(n_tries ** 2)
            continue

        if r.status_code == 200:
            with open(download_location.format(today.strftime('%Y%m%d')), 'wb') as outfile:
                outfile.write(r.content)
            print('Downloaded metadata')
            errcode = r.status_code
        elif r.status_code == 404:
            print("No metadata found, error 404")
            errcode = 200  # nothing to retry; leave the loop
        else:
            # try again
            print('Response error {}, attempt {}'.format(r.status_code, n_tries))
            errcode = r.status_code
            n_tries += 1
            max_time_per_dl = max_time_per_dl*2  # double the timeout
            time.sleep(n_tries ** 2)  # back off a little longer every attempt

    if n_tries == n_max_tries:
        print('Failed to download {} after {} attempts (last timeout {} s), error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
    time.sleep(1)

    metadata = pd.read_excel(download_location.format(today.strftime('%Y%m%d')), engine='openpyxl').fillna('')  # .xlsm requires openpyxl; pyxlsb only reads binary .xlsb files
    print(metadata)
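
    # The two commented-out blocks below sketch how the metadata could be
    # versioned as JSON over time; they would additionally need the csv and
    # json standard-library modules to be imported.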
    """
    # create json from original metadata file
    json_metadata = {}
    with open('/esarchive/obs/ghost/EANET/metadata/network_provided/EANET_META.csv', 'r', encoding='utf-8') as file:
        csv_filedata = csv.DictReader(file)

        for row in csv_filedata:  
            key = row['cve_estac']
            update_date = today.strftime('%Y-%m-%d')
            for parameter in row:
                row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
            json_metadata[key] = row

    with open('/esarchive/obs/ghost/EANET/metadata/processed/EANET_META.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(json_metadata, indent=4))
        
    """
    """
    # create json in desired shape from current metadata file
    json_metadata_now = {}  
    with open(download_location.format(today.strftime('%Y%m%d')), encoding='utf-8') as file:
        csv_filedata = csv.DictReader(file)

        for row in csv_filedata:  
            key = row['cve_estac']
            update_date = today.strftime('%Y-%m-%d')
            for parameter in row:
                row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
            json_metadata_now[key] = row

    
    # read standardised file to compare!
    with open('/esarchive/obs/ghost/EANET/metadata/processed/EANET_META.json', 'r', encoding='ISO-8859-1') as f:
        json_metadata = json.loads(f.read())

        for station in json_metadata: # loop through all the old stations
            if station in json_metadata_now.keys(): # if station is in current meta data, go on
                for parameter in json_metadata[station]:
                    if parameter in json_metadata_now[station].keys(): # check if column of csv exists in new file
                        if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
                            # if the value changed, append it to the standardised metadata file
                            print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
                            json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
                            json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
                        else:
                            pass
                    else:
                        print('{} not in new metadata file'.format(parameter))
            else:
                print('Station {} is no longer in the current metadata'.format(station))

        for station in json_metadata_now: # loop through all the new stations
            for parameter in json_metadata_now[station]: # loop through all the parameters
                if station in json_metadata.keys(): # if station is in old meta data
                    pass # comparison was done before
                else: # new station appeared!
                    print('New station {}'.format(station))
                    json_metadata.update({station: json_metadata_now[station]})
                # is there a new parameter that wasn't in the old file?
                if parameter in json_metadata[station].keys():
                    pass # parameter (column) is already there
                else:
                    print('{} is new'.format(parameter))
                    json_metadata[station].update({parameter: json_metadata_now[station][parameter]})


    # save
    with open('/esarchive/obs/ghost/EANET/metadata/processed/EANET_META.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(json_metadata, indent=4))"""
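

if __name__ == '__main__':
    # minimal usage sketch; the argument values below are illustrative
    # assumptions, not values taken from the original script
    download_data(mode='all', version='1.0', n_max_tries=3, max_time_per_dl=10)
    download_metadata(n_max_tries=3, max_time_per_dl=10)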