# EANET_download.py
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from chromedriver_py import binary_path
from selenium.webdriver.chrome.options import Options
import requests
import time
from datetime import date, timedelta
import zipfile

import os.path
import os
import pandas as pd
def download_data(mode, version, n_max_tries, max_time_per_dl):
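    """Log in to the EANET public-data portal with Selenium and download the
    measurement archive, one zip file per country.

    mode            -- only 'all' is supported ('nrt' is unavailable for EANET)
    version         -- subdirectory name under original_files/
    n_max_tries     -- not used in this function (kept for a uniform interface)
    max_time_per_dl -- seconds allowed for page loads and per-country downloads
    """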
    url = 'https://monitoring.eanet.asia/document/menu/index#publicData'
    today = date.today().strftime('%Y%m%d')  # note: the server may stamp the zip with a different date (timezone offset); a timedelta(days=1) correction was considered
    
    if mode == 'all':
        bdate = date(1980, 1, 1)  # a date safely before the record starts
        edate = date.today()

        download_location = "/esarchive/obs/ghost/EANET/original_files/{}/".format(version)
        os.makedirs(download_location, exist_ok=True)

    elif mode == 'nrt':
        print("EANET does not provide near-real-time data")
        exit()

    else:
        print("time mode '{}' not applicable".format(mode))
        exit()
    

    # configure Chrome to download straight into the archive directory
    options = Options()
    prefs = {'download.default_directory' : download_location}
    options.add_experimental_option('prefs', prefs)
    options.add_argument("--no-sandbox")
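    # note: Chrome runs with a visible window here; on a display-less server
    # one would typically also pass options.add_argument("--headless=new")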
    svc = webdriver.ChromeService(executable_path=binary_path)
    driver = webdriver.Chrome(service=svc, options=options)

    driver.get(url)

    # wait until the page has loaded and the login form is present, polling
    # instead of sleeping for the full duration
    WebDriverWait(driver, max_time_per_dl).until(
        EC.presence_of_element_located((By.ID, "email")))

    # login
    email = driver.find_element(By.ID, "email")
    # assumption: credentials are read from environment variables rather than
    # being hardcoded in the source
    email.send_keys(os.environ["EANET_EMAIL"])
    passwd = driver.find_element(By.ID, "passwd")
    passwd.send_keys(os.environ["EANET_PASSWD"])
    time.sleep(max_time_per_dl)
    driver.find_element(By.NAME, "submitBtn").click()

    time.sleep(max_time_per_dl)

    # find countries
    dropdown_element = driver.find_element(By.ID, 'countryCd')
    select = Select(dropdown_element)
    countries = []
    COUNTRIES = []
    for country in select.options:
        if country.text == "-":
            continue  # skip the placeholder entry
        countries.append(str(country.text))

        # map the dropdown labels to the directory names used in the archive
        if country.text == "Korea, Republic of":
            COUNTRIES.append("SOUTH_KOREA")
        elif country.text == "Lao People's Democratic Republic":
            COUNTRIES.append("LAOS")
        elif country.text == "Viet Nam":
            COUNTRIES.append("VIETNAM")
        else:
            COUNTRIES.append(str(country.text).upper())

    print(countries)
    print(COUNTRIES)
 
    # uncomment to test on a subset of countries:
    #countries = ["Cambodia", "China"]
    #COUNTRIES = ["CAMBODIA", "CHINA"]

    # download
    for i, country in enumerate(countries):
        # create download directory
        os.makedirs(download_location+COUNTRIES[i]+'/', exist_ok=True)

        dropdown_element = driver.find_element(By.ID, 'countryCd')
        select = Select(dropdown_element)
        select.select_by_visible_text(country)
        
        print("successfully selected {}".format(country))

        # start download
        driver.find_element(By.CSS_SELECTOR, "input[id='downloadBtn'][type='button']").click()

        # wait until the download has finished, then move the zip into the
        # country directory and unzip it there; max_time_per_dl doubles as the
        # timeout in seconds (assumption)
        base_zip = "{}detail_{}.zip".format(download_location, today)
        country_zip = "{}{}/detail_{}.zip".format(download_location, COUNTRIES[i], today)
        waited = 0
        while not os.path.exists(country_zip):
            time.sleep(1)
            waited += 1
            if waited > max_time_per_dl:
                print('{} download timed out'.format(country))
                break

            if os.path.isfile(base_zip):
                print('{} download successful'.format(country))

                # move to country directory
                os.rename(base_zip, country_zip)

                # unzip
                with zipfile.ZipFile(country_zip, 'r') as zip_ref:
                    zip_ref.extractall(os.path.dirname(country_zip))

        # remove the archive once extracted (it may be missing after a timeout)
        if os.path.isfile(country_zip):
            os.remove(country_zip)


    # quit() closes the browser and ends the chromedriver session
    driver.quit()


def download_metadata(n_max_tries, max_time_per_dl):
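    """Download the EANET site-information workbook with retries and load it
    into a pandas DataFrame."""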

    url_metadata = 'https://www.eanet.asia/wp-content/uploads/2024/01/Site_Information_Acid_Deposition_Monitoring_NMP2023_1117.xlsm'
    download_location = "/esarchive/obs/ghost/EANET/metadata/network_provided/EANET_META_{}.xlsm"  # the source file is an Excel .xlsm workbook, so keep that extension
    headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}

    n_tries = 0
    errcode = 999
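    # 999 is a sentinel; any value other than 200 keeps the retry loop running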
    today = date.today()
    
    while (n_tries < n_max_tries) and (errcode != 200):
        try:
            r = requests.get(url_metadata, timeout=max_time_per_dl, headers=headers)
        except requests.exceptions.RequestException as e:
            print('Request failed ({}), attempt {}'.format(e, n_tries))
            n_tries += 1
            time.sleep(n_tries ** 2)
            continue

        if r.status_code == 200:
            with open(download_location.format(today.strftime('%Y%m%d')), 'wb') as outfile:
                outfile.write(r.content)
            print('Downloaded metadata')
            errcode = r.status_code
        elif r.status_code == 404:
            print("No metadata found, error 404")
            errcode = 200  # nothing to retry; leave the loop
        else:
            # try again
            print('Response error {}, attempt {}'.format(r.status_code, n_tries))
            errcode = r.status_code
            n_tries += 1
            max_time_per_dl = max_time_per_dl*2  # double the timeout
            time.sleep(n_tries ** 2)  # back off a little longer every attempt

    if n_tries == n_max_tries:
        print('Failed to download {} after {} attempts (last timeout {} s), error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
    time.sleep(1)

    metadata = pd.read_excel(download_location.format(today.strftime('%Y%m%d')), engine='openpyxl').fillna('')  # .xlsm requires openpyxl; pyxlsb only reads binary .xlsb files
    print(metadata)
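
    # The two commented-out blocks below sketch how the metadata could be
    # versioned as JSON over time; they would additionally need the csv and
    # json standard-library modules to be imported.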
    """
    # create json from original metadata file
    json_metadata = {}
    with open('/esarchive/obs/ghost/EANET/metadata/network_provided/EANET_META.csv', 'r', encoding='utf-8') as file:
        csv_filedata = csv.DictReader(file)

        for row in csv_filedata:  
            key = row['cve_estac']
            update_date = today.strftime('%Y-%m-%d')
            for parameter in row:
                row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
            json_metadata[key] = row

    with open('/esarchive/obs/ghost/EANET/metadata/processed/EANET_META.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(json_metadata, indent=4))
        
    """
    """
    # create json in desired shape from current metadata file
    json_metadata_now = {}  
    with open(download_location.format(today.strftime('%Y%m%d')), encoding='utf-8') as file:
        csv_filedata = csv.DictReader(file)

        for row in csv_filedata:  
            key = row['cve_estac']
            update_date = today.strftime('%Y-%m-%d')
            for parameter in row:
                row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
            json_metadata_now[key] = row

    
    # read standardised file to compare!
    with open('/esarchive/obs/ghost/EANET/metadata/processed/EANET_META.json', 'r', encoding='ISO-8859-1') as f:
        json_metadata = json.loads(f.read())

        for station in json_metadata: # loop through all the old stations
            if station in json_metadata_now.keys(): # if station is in current meta data, go on
                for parameter in json_metadata[station]:
                    if parameter in json_metadata_now[station].keys(): # check if column of csv exists in new file
                        if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
                            # if the value changed, append it to the standardised metadata file
                            print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
                            json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
                            json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
                        else:
                            pass
                    else:
                        print('{} not in new metadata file'.format(parameter))
            else:
                print('Station {} is no longer in the current metadata'.format(station))

        for station in json_metadata_now: # loop through all the new stations
            for parameter in json_metadata_now[station]: # loop through all the parameters
                if station in json_metadata.keys(): # if station is in old meta data
                    pass # comparison was done before
                else: # new station appeared!
                    print('New station {}'.format(station))
                    json_metadata.update({station: json_metadata_now[station]})
                # is there a new parameter that wasn't in the old file?
                if parameter in json_metadata[station].keys():
                    pass # parameter (column) is already there
                else:
                    print('{} is new'.format(parameter))
                    json_metadata[station].update({parameter: json_metadata_now[station][parameter]})


    # save
    with open('/esarchive/obs/ghost/EANET/metadata/processed/EANET_META.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(json_metadata, indent=4))"""
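

if __name__ == '__main__':
    # minimal usage sketch; the argument values below are illustrative
    # assumptions, not values taken from the original script
    download_data(mode='all', version='1.0', n_max_tries=3, max_time_per_dl=10)
    download_metadata(n_max_tries=3, max_time_per_dl=10)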