Commits (2)
......@@ -171,20 +171,21 @@ def download_metadata(n_max_tries, max_time_per_dl):
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
time.sleep(1)
"""# create json from original metadata file
# create json from original metadata file
json_metadata = {}
with open('/esarchive/obs/ghost/CAPMoN/metadata/network_provided/CAPMoN_META.csv', 'r', encoding='ISO-8859-1') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['SiteName_NomDuSite']
key = row['ID']+'_'+row['Measurements_Mesures'].replace('"', '')[:3]
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/CAPMoN/metadata/processed/CAPMoN_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))"""
f.write(json.dumps(json_metadata, indent=4))
# create json in desired shape from current metadata file
......@@ -193,7 +194,7 @@ def download_metadata(n_max_tries, max_time_per_dl):
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['SiteName_NomDuSite']
key = row['ID']+'_'+row['Measurements_Mesures'].replace('"', '')[:3]
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
......@@ -218,11 +219,18 @@ def download_metadata(n_max_tries, max_time_per_dl):
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
for parameter in json_metadata_now[station]: # loop through all the parameters
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# is there a new parameter that wasn't in the old file?
if parameter in json_metadata[station].keys():
pass # parameter (column) is already there
else:
print('{} is new'.format(parameter))
json_metadata[station].update({parameter: json_metadata_now[station][parameter]})
# save
......
......@@ -5,10 +5,11 @@ from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup
import requests
import time
......@@ -23,14 +24,16 @@ import zipfile
import shutil
import os
import re
import csv
import json
def scraper(mode, version):
def download_data(mode, version, n_max_tries, max_time_per_dl):
# paths and variables
variables_website = ['MP 10','MP 2,5', 'MP 10discreto', 'SO2', 'NO2', 'NOX', 'NO', 'CO', 'O3', 'Pb', 'As', 'Cu'] # complete list later
variables_website = ['MP 10','MP 2,5', 'MP 10discreto', 'SO2', 'NO2', 'NOX', 'NO', 'CO', 'O3', 'Pb', 'As', 'Cu']
variables_text = ["Material particulado MP 10", "Material particulado MP 2,5", "Material particulado 10 micrometros discreto", 'Dióxido de azufre', 'Dióxido de nitrógeno', 'Óxidos de nitrógeno', 'Monóxido de nitrógeno', 'Monóxido de carbono', 'Ozono', 'Plomo', 'Arsénico', 'Cobre']
time_resolutions_website = ["registro diario", "registro horario"] # complete later
time_resolutions_website = ["registro diario", "registro horario"]
variables_ghost = ['PM10', 'PM2.5', 'PM10disc', 'SO2', 'NO2', 'NOX', 'NO','CO', 'O3','Pb', 'As', 'Cu'] # needs to be same order as variables_website!
time_resolution_ghost = ['daily', 'hourly']
......@@ -39,15 +42,28 @@ def scraper(mode, version):
baseurl = 'https://sinca.mma.gob.cl/index.php/'
# only for nrt
bdate = "240101"
if mode == 'all':
# create download directory
os.makedirs('/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version), exist_ok=True)
download_location = '/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version)
elif mode == 'nrt':
bdate = date(date.today().year, 1, 1).strftime('%Y%m%d')[2:] #"240101"
edate = date.today().strftime('%Y%m%d')[2:]
print(edate)
# create download directory
version = mode
os.makedirs('/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version), exist_ok=True)
download_location = '/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version)
else:
print('time mode inapplicable')
n_tries = 0
errcode = 999
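# retry the whole Selenium session: errcode only becomes 200 once a run finishes without raising WebDriverException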
while (n_tries < n_max_tries) and (errcode != 200):
try:
# set up driver
options = Options()
prefs = {'download.default_directory' : download_location}
......@@ -57,12 +73,14 @@ def scraper(mode, version):
options.add_argument("--disable-extensions")
options.add_argument("--disable-gpu")
options.add_argument("--disable-dev-shm-usage")
if n_tries > 0:
options.add_argument("--headless")
svc = webdriver.ChromeService(executable_path=binary_path)
driver = webdriver.Chrome(service=svc, options=options)
# open url
driver.get(baseurl)
WebDriverWait(driver, 60).until(EC.presence_of_all_elements_located((By.XPATH, '//a[contains(@href, "/index.php/region/index/id/")]'))) # wait till loaded
WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.XPATH, '//a[contains(@href, "/index.php/region/index/id/")]'))) # wait till loaded
# navigate to regions
html = driver.page_source
......@@ -145,7 +163,7 @@ def scraper(mode, version):
driver.switch_to.frame("right")
time.sleep(10)
WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.LINK_TEXT, "Excel CSV"))) # wait till loaded
WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.LINK_TEXT, "Excel CSV"))) # wait till loaded
driver.find_element(By.LINK_TEXT, "Excel CSV").click()
......@@ -162,43 +180,71 @@ def scraper(mode, version):
driver.switch_to.default_content()
driver.switch_to.frame("left")
i=i+1
driver.close()
errcode = 200
except WebDriverException as e:
print(e)
n_tries = n_tries+1
print("Number of tries: {}".format(n_tries))
continue
if n_tries == n_max_tries:
print('Failed downloading CHILE_SINCA data {} times in {} seconds'.format(n_tries, max_time_per_dl))
driver.close()
def scraper_metadata(mode, version):
def download_metadata(n_max_tries, max_time_per_dl):
# paths and variables
variables_website = ['MP 10','MP 2,5', 'MP 10discreto', 'SO2', 'NO2', 'NOX', 'NO', 'CO', 'O3', 'Pb', 'As', 'Cu']
variables_text = ["Material particulado MP 10", "Material particulado MP 2,5", "Material particulado 10 micrometros discreto", 'Dióxido de azufre', 'Dióxido de nitrógeno', 'Óxidos de nitrógeno', 'Monóxido de nitrógeno', 'Monóxido de carbono', 'Ozono', 'Plomo', 'Arsénico', 'Cobre']
time_resolutions_website = ["registro diario", "registro horario"]
variables_ghost = ['PM10', 'PM2.5', 'PM10disc', 'SO2', 'NO2', 'NOX', 'NO','CO', 'O3','Pb', 'As', 'Cu'] # needs to be same order as variables_website!
time_resolution_ghost = ['daily', 'hourly']
baseurl = 'https://sinca.mma.gob.cl/index.php/'
today = date.today()
metadata15 = pd.read_csv("/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/temp/CHILE_SINCA_META_temp.txt")
print(metadata15)
metadata15 = pd.read_csv("/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/temp/CHILE_SINCA_META_temp.txt")
#print(metadata15)
instruments_old = ['O3_instrument','NO_instrument','NO2_instrument','CO_instrument','CH4_instrument','SO2_instrument','NMHC_instrument','HC_instrument','PM10_instrument','PM2.5_instrument','As_instrument','Cu_instrument','Pb_instrument']
"""
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/processed/CHILE_SINCA_META.json', 'r', encoding='utf-8') as f:
metadata_old = json.loads(f.read())
n_tries = 0
errcode = 999
metadata_new = {}
while (n_tries < n_max_tries) and (errcode != 200):
try:
# set up driver
options = Options()
#prefs = {'download.default_directory' : download_location}
#options.add_experimental_option('prefs', prefs)
options.add_argument("--no-sandbox")
options.add_argument("--headless")
svc = webdriver.ChromeService(executable_path=binary_path)
driver = webdriver.Chrome(service=svc, options=options)
# open url
driver.get(baseurl)
WebDriverWait(driver, 60).until(EC.presence_of_all_elements_located((By.XPATH, '//a[contains(@href, "/index.php/region/index/id/")]'))) # wait till loaded
WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.XPATH, '//a[contains(@href, "/index.php/region/index/id/")]'))) # wait till loaded
# navigate to regions
html = driver.page_source
soup = BeautifulSoup(html, features="html.parser")
regions = soup.find_all("a", href=re.compile(r"^/index.php/region/index/id/"))
regions = soup.find_all("a", {"href": "/index.php/region/index/id/II"}) # Antofagasta
#regions = soup.find_all("a", {"href": "/index.php/region/index/id/II"}) # Antofagasta
for region in regions:
print("Region is "+region.getText())
driver.get("https://sinca.mma.gob.cl/"+region.get("href"))
time.sleep(3)
time.sleep(1)
html = driver.page_source
soup = BeautifulSoup(html, features="html.parser")
......@@ -207,8 +253,8 @@ def scraper_metadata(mode, version):
for station in stations:
station_name = station.getText()
print(station_name)
station_name_new = station.getText()
print(station_name_new)
driver.get("https://sinca.mma.gob.cl/"+station.get("href"))
time.sleep(3)
......@@ -220,29 +266,121 @@ def scraper_metadata(mode, version):
region = soup.find("th", text="Región").find_next_sibling().getText()
province = soup.find("th", text="Provincia").find_next_sibling().getText()
commune = soup.find("th", text="Comuna").find_next_sibling().getText()
UTM_coordinates = soup.find("th", text="Coordenadas UTM").find_next_sibling().getText()
timezone = soup.find("th", text="Huso horario").find_next_sibling().getText()
scraped_metadata = [station_reference, station_name, region, province, commune, UTM_coordinates, timezone]
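# the scraped UTM text is assumed to look like "<easting>E <northing>N..."; stripping spaces and splitting at 'E' separates the two coordinates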
UTM_coordinates = soup.find("th", text="Coordenadas UTM").find_next_sibling().getText().replace(' ', '')
lon = UTM_coordinates.split('E')[0]+'E'
lat = UTM_coordinates.split('E')[1].split("\n")[0]
timezone = soup.find("th", text="Huso horario").find_next_sibling().getText().replace(' ', '')
ins_table = soup.find('table', id="medicion")
if ins_table is not None: # check if there are instruments for air pollution at this station
instruments = ins_table.find_all("td", {"class": "helpTecnica center"})
instruments_per_component = {}
else:
continue
metadata15_per_station = metadata15.loc[metadata15["station_name"] == station_name]
print(region)
print(metadata15_per_station)
print(metadata15_per_station["region"].iloc[0])
for instrument in instruments:
component = instrument.find_parent().find('a').getText()
try: # rename
component = variables_ghost[variables_text.index(component)]
except:
try:
component = variables_ghost[variables_website.index(component)]
except:
pass
i=0
for column in metadata15_per_station.head():
print(column)
if 'Ozono.-' in component:
component = 'O3'
if metadata15_per_station[column].iloc[0] == scraped_metadata[i]:
print("ok!")
#======
if "No informado" in instrument.getText():
instruments_per_component[component] = ''
else:
print("not ok")
instrument_name = re.sub(' +', ' ', instrument.getText())
instrument_name = instrument_name.split("\n")[-1]
instruments_per_component[component] = instrument_name
for station_reference in metadata_old:
if metadata_old[station_reference]['station_name']['values'][0] == station_name_new: # match station with previously referenced station reference from old file
i=0
metadata_new[station_reference] = {} # create inner dictionary
scraped_metadata = [station_reference, station_name_new, region, province, commune, lon, lat, timezone]
for parameter in metadata_old[station_reference]: # loop through the meta parameters
if ("instrument" not in parameter) and ("comments" not in parameter): # go through all that are not instruments
metadata_new[station_reference][parameter] = {"values": [scraped_metadata[i]], "update_time": [today.strftime('%Y-%m-%d')]}
i=i+1
elif "comments" == parameter:
metadata_new[station_reference][parameter] = {"values": [''], "update_time": [today.strftime('%Y-%m-%d')]}
else: # go through the instruments
for component in instruments_per_component:
if component in parameter:
metadata_new[station_reference][parameter] = {"values": [instruments_per_component[component]], "update_time": [today.strftime('%Y-%m-%d')]}
else:
metadata_new[station_reference][parameter] = {"values": [''], "update_time": [today.strftime('%Y-%m-%d')]}
# save
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/CHILE_SINCA_META_{}.json'.format(today.strftime('%Y%m%d')), 'w', encoding='utf-8') as f:
f.write(json.dumps(metadata_new, indent=4, ensure_ascii=False))
driver.close()
errcode = 200
except WebDriverException as e:
print(e)
n_tries = n_tries+1
print("Number of tries: {}".format(n_tries))
continue
if n_tries == n_max_tries:
print('Failed downloading CHILE_SINCA metadata {} times in {} seconds'.format(n_tries, max_time_per_dl))"""
"""
# create json from original metadata file =====================================================================================
json_metadata = {}
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/CHILE_SINCA_META.txt', 'r') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['station_reference']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
driver.close()
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/processed/CHILE_SINCA_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
"""
# read newly scraped file
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/CHILE_SINCA_META_{}.json'.format(today.strftime('%Y%m%d')), 'r', encoding='utf-8') as f:
json_metadata_now = json.loads(f.read())
# read standardised file to compare!
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/processed/CHILE_SINCA_META.json', 'r', encoding='utf-8') as f:
json_metadata = json.loads(f.read())
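# every field is kept as a history: when a scraped value differs from the last stored one, the new value and its date are appended,
# e.g. (illustrative) {"values": ["old value", "new value"], "update_time": ["2023-11-02", "2024-03-15"]}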
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# save
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/processed/CHILE_SINCA_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
\ No newline at end of file
......@@ -15,14 +15,30 @@ import zipfile
import os.path
import os
import pandas as pd
def scraper(mode, version):
def download_data(mode, version, n_max_tries, max_time_per_dl):
url = 'https://monitoring.eanet.asia/document/menu/index#publicData'
download_location = "/esarchive/obs/ghost/EANET/original_files/{}/".format(version)
today = date.today().strftime('%Y%m%d') #+ timedelta(days = 1)).strftime('%Y%m%d') # problem with timezones???
#print(today)
if mode == 'all':
bdate = date(1980, 1, 1) #date(1960, 1, 1) # date before record starts
edate = date.today()
os.makedirs('/esarchive/obs/ghost/EANET/original_files/{}/'.format(version), exist_ok=True)
download_location = "/esarchive/obs/ghost/EANET/original_files/{}/".format(version)
elif mode == 'nrt':
print("EANET no nrt")
exit()
else:
print('time mode inapplicable')
exit()
options = Options()
prefs = {'download.default_directory' : download_location}
......@@ -32,17 +48,17 @@ def scraper(mode, version):
driver = webdriver.Chrome(service=svc, options=options)
driver.get(url)
time.sleep(2)
time.sleep(max_time_per_dl)
# login
email = driver.find_element(By.ID, "email")
email.send_keys("raphael.grodofzig@bsc.es")
passwd = driver.find_element(By.ID, "passwd")
passwd.send_keys("274s9QZ5")
time.sleep(2)
time.sleep(max_time_per_dl)
driver.find_element(By.NAME, "submitBtn").click()
time.sleep(3)
time.sleep(max_time_per_dl)
# find countries
dropdown_element = driver.find_element(By.ID, 'countryCd')
......@@ -105,3 +121,108 @@ def scraper(mode, version):
i=i+1
driver.close()
def download_metadata(n_max_tries, max_time_per_dl):
url_metadata = 'https://www.eanet.asia/wp-content/uploads/2024/01/Site_Information_Acid_Deposition_Monitoring_NMP2023_1117.xlsm'
download_location = "/esarchive/obs/ghost/EANET/metadata/network_provided/EANET_META_{}.csv"
Headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
n_tries = 0
errcode = 999
today = date.today()
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url_metadata, timeout=max_time_per_dl, headers=Headers)
if r.status_code == 200:
with open(download_location.format(today.strftime('%Y%m%d')), 'wb') as outfile:
outfile.write(r.content)
print('Downloaded metadata')
errcode = r.status_code
elif r.status_code == 404:
print("No metadata found, error 404")
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2 # increase waiting time
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
time.sleep(1)
metadata = pd.read_excel(download_location.format(today.strftime('%Y%m%d')), engine='pyxlsb').fillna('')
print(metadata)
"""
# create json from original metadata file
json_metadata = {}
with open('/esarchive/obs/ghost/EANET/metadata/network_provided/EANET_META.csv', 'r', encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['cve_estac']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/EANET/metadata/processed/EANET_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
"""
"""
# create json in desired shape from current metadata file
json_metadata_now = {}
with open(download_location.format(today.strftime('%Y%m%d')), encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['cve_estac']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata_now[key] = row
# read standardised file to compare!
with open('/esarchive/obs/ghost/EANET/metadata/processed/EANET_META.json', 'r', encoding='ISO-8859-1') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if parameter in json_metadata_now[station].keys(): # check if column of csv exists in new file
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('{} not in new metadata file'.format(parameter))
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
for parameter in json_metadata_now[station]: # loop through all the parameters
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# is there a new parameter that wasn't in the old file?
if parameter in json_metadata[station].keys():
pass # parameter (column) is already there
else:
print('{} is new'.format(parameter))
json_metadata[station].update({parameter: json_metadata_now[station][parameter]})
# save
with open('/esarchive/obs/ghost/EANET/metadata/processed/EANET_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))"""
\ No newline at end of file
import requests
import time
from datetime import date
from datetime import timedelta
import zipfile
import urllib
import os.path
import os
import pandas as pd
def scraper(mode):
url = 'https://monitoring.eanet.asia/document/menu/index#publicData'
download_url = 'https://www.eanet.asia/wp-content/uploads/2024/01/Site_Information_Acid_Deposition_Monitoring_NMP2023_1117.xlsm'
download_location = "/esarchive/obs/ghost/EANET/metadata/network_provided/"
today = date.today().strftime('%Y%m%d') #+ timedelta(days = 1)).strftime('%Y%m%d') # problem with timezones???
"""
Headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
r = requests.get(download_url, timeout=120, headers=Headers)
print(r.status_code)
if r.status_code == 200:
urllib.request.urlretrieve(url, download_location+"downloaded_metadata.xlsm")
print('Downloaded metadata')
else:
print('url status not ok')"""
# open file
metadata = pd.read_excel(download_location+"downloaded_metadata.xlsm", engine='pyxlsb').fillna('')
print(metadata)
......@@ -8,20 +8,20 @@ import urllib
import tarfile
import shutil
import gzip
import csv
import json
def scraper(mode, version):
def download_data(mode, version, n_max_tries, max_time_per_dl):
base_url = 'http://www.aire.cdmx.gob.mx/opendata/anuales_horarios_gz/contaminantes_{}.csv.gz'
if mode == 'all':
bdate = date(1980, 1, 1) #date(1960, 1, 1) # date before record starts
edate = date.today()
os.makedirs('/esarchive/obs/ghost/MEXICO_CDMX/original_files/{}/'.format(version), exist_ok=True)
download_location = '/esarchive/obs/ghost/MEXICO_CDMX/original_files/'+version+'/contaminantes_{}.csv.gz'
elif mode == 'nrt':
......@@ -39,21 +39,143 @@ def scraper(mode, version):
# download
for year in years:
url = base_url.format(year)
r = requests.get(url, timeout=120)
n_tries = 0
errcode = 999
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url, timeout=max_time_per_dl)
if r.status_code == 200:
urllib.request.urlretrieve(url, download_location.format(year))
print('Downloaded {}'.format(url))
# unzip
with gzip.open(download_location.format(year), 'rb') as f_in:
with open(download_location.format(year)[:-3], 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
# remove files
os.remove(download_location.format(year))
errcode = r.status_code
elif r.status_code == 404:
print("No data found, error 404, year {}".format(year))
errcode = 200
elif r.status_code == 403:
print("Permission denied for {}".format(year))
errcode = 200
else:
print('No {}'.format(url))
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2 # increase waiting time
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url, n_tries, max_time_per_dl, errcode))
time.sleep(1)
def download_metadata(n_max_tries, max_time_per_dl):
url_metadata = 'http://www.aire.cdmx.gob.mx/opendata/catalogos/cat_estacion.csv'
download_location = "/esarchive/obs/ghost/MEXICO_CDMX/metadata/network_provided/MEXICO_CDMX_META_{}.csv"
n_tries = 0
errcode = 999
today = date.today()
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url_metadata, timeout=max_time_per_dl)
if r.status_code == 200:
with open(download_location.format('_unformatted'), 'wb') as outfile:
outfile.write(r.content)
print('Downloaded metadata')
errcode = r.status_code
elif r.status_code == 404:
print("No metadata found, error 404")
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2 # increase waiting time
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
time.sleep(1)
# import it as pandas to clean header
meta_file = pd.read_csv(download_location.format('_unformatted'), header=[1], encoding='ISO-8859-1')
meta_file.to_csv(download_location.format(today.strftime('%Y%m%d')), index=False)
os.remove(download_location.format('_unformatted'))
# create json from original metadata file
"""json_metadata = {}
with open('/esarchive/obs/ghost/MEXICO_CDMX/metadata/network_provided/MEXICO_CDMX_META.csv', 'r', encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['cve_estac']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/MEXICO_CDMX/metadata/processed/MEXICO_CDMX_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
"""
# create json in desired shape from current metadata file
json_metadata_now = {}
with open(download_location.format(today.strftime('%Y%m%d')), encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['cve_estac']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata_now[key] = row
# read standardised file to compare!
with open('/esarchive/obs/ghost/MEXICO_CDMX/metadata/processed/MEXICO_CDMX_META.json', 'r', encoding='ISO-8859-1') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if parameter in json_metadata_now[station].keys(): # check if column of csv exists in new file
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('{} not in new metadata file'.format(parameter))
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
for parameter in json_metadata_now[station]: # loop through all the parameters
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# is there a new parameter that wasn't in the old file?
if parameter in json_metadata[station].keys():
pass # parameter (column) is already there
else:
print('{} is new'.format(parameter))
json_metadata[station].update({parameter: json_metadata_now[station][parameter]})
# save
with open('/esarchive/obs/ghost/MEXICO_CDMX/metadata/processed/MEXICO_CDMX_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
\ No newline at end of file
......@@ -22,22 +22,18 @@ from selenium.webdriver.support import expected_conditions as EC
def scraper(mode, version):
def download_data(mode, version, n_max_tries, max_time_per_dl):
baseurl = 'https://www.miteco.gob.es/es/calidad-y-evaluacion-ambiental/temas/atmosfera-y-calidad-del-aire/calidad-del-aire/evaluacion-datos/datos/datos-2001-2021.html'
if mode == 'all':
bdate = date(2001, 1, 1) #date(1960, 1, 1) # date before record starts
edate = date.today()
os.makedirs('/esarchive/obs/ghost/MITECO/original_files/{}/'.format(version), exist_ok=True)
download_location = '/esarchive/obs/ghost/MITECO/original_files/{}/'.format(version)
elif mode == 'nrt':
bdate = date(2024, 3, 2) #date.today() - timedelta(days = 1) # if code is run after 2 am, data from previous day will be available
edate = date(2024, 3, 3) #date.today() - timedelta(days = 1)
print("nrt not available")
download_location = '/esarchive/obs/ghost/MITECO/original_files/nrt/'
else:
......@@ -70,23 +66,40 @@ def scraper(mode, version):
for zip_link in zip_links:
filename = zip_link.get("href").rpartition('/')[-1]
url = 'https://www.miteco.gob.es/{}'.format(zip_link.get("href"))
n_tries = 0
errcode = 999
r = requests.get(url, timeout=120)
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url, timeout=max_time_per_dl)
if r.status_code == 200:
urllib.request.urlretrieve(url, download_location+filename)
print('Downloaded {}'.format(filename))
# unzip
with zipfile.ZipFile(download_location+filename, 'r') as zip_ref:
zip_ref.extractall(download_location)
os.remove(download_location+filename)
else:
print('No {}'.format(url))
errcode = r.status_code
time.sleep(1)
elif r.status_code == 404:
print("No data found, error 404")
errcode = 200
elif r.status_code == 403:
print("Permission denied for {}".format(url))
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url, n_tries, max_time_per_dl, errcode))
time.sleep(1)
# go to hyperlinks
......@@ -119,7 +132,11 @@ def scraper(mode, version):
continue
r = requests.get(url, timeout=120)
n_tries = 0
errcode = 999
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url, timeout=max_time_per_dl)
if r.status_code == 200:
urllib.request.urlretrieve(url, download_location+filename)
print('Downloaded {}'.format(filename))
......@@ -129,10 +146,26 @@ def scraper(mode, version):
zip_ref.extractall(download_location)
os.remove(download_location+filename)
errcode = r.status_code
else:
print('No {}'.format(url))
elif r.status_code == 404:
print("No data found, error 404")
errcode = 200
elif r.status_code == 403:
print("Permission denied for {}".format(url))
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url, n_tries, max_time_per_dl, errcode))
time.sleep(1)
# delete metadata
......@@ -152,3 +185,96 @@ def scraper(mode, version):
driver.close()
def download_metadata(n_max_tries, max_time_per_dl):
url_metadata = 'https://www.miteco.gob.es/content/dam/miteco/es/calidad-y-evaluacion-ambiental/sgalsi/atm%C3%B3sfera-y-calidad-del-aire/evaluaci%C3%B3n-2022/Metainformacion2022.xlsx'
download_location = "/esarchive/obs/ghost/MITECO/metadata/network_provided/MITECO_META_{}.xlsx"
Headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
n_tries = 0
errcode = 999
today = date.today()
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url_metadata, headers=Headers, timeout=max_time_per_dl) # issue the request on every attempt so retries actually re-download
if r.status_code == 200:
with open(download_location.format(today.strftime('%Y%m%d')), 'wb') as outfile:
outfile.write(r.content)
print('Downloaded metadata')
errcode = r.status_code
elif r.status_code == 404:
print("No metadata found, error 404")
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
time.sleep(1)
# convert to csv
file = pd.read_excel(download_location.format(today.strftime('%Y%m%d')))
file.to_csv('/esarchive/obs/ghost/MITECO/metadata/network_provided/MITECO_META_{}.csv'.format(today.strftime('%Y%m%d')), index=False, header=True)
"""# create json from original metadata file
json_metadata = {}
with open('/esarchive/obs/ghost/MITECO/metadata/network_provided/MITECO_META.csv', 'r', encoding='ISO-8859-1') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['SiteName_NomDuSite']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/MITECO/metadata/processed/MITECO_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
# create json in desired shape from current metadata file
json_metadata_now = {}
with open(download_location.format(today.strftime('%Y%m%d')), encoding='ISO-8859-1') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['SiteName_NomDuSite']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata_now[key] = row
# read standardised file to compare!
with open('/esarchive/obs/ghost/MITECO/metadata/processed/MITECO_META.json', 'r', encoding='ISO-8859-1') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# save
with open('/esarchive/obs/ghost/MITECO/metadata/processed/MITECO_META.json', 'w', encoding='ISO-8859-1') as f:
f.write(json.dumps(json_metadata, indent=4))"""
......@@ -7,20 +7,21 @@ import re
import os
from datetime import date
from datetime import timedelta
import requests
import csv
import json
import time
def scraper(mode, version):
def download_data(mode, version, n_max_tries, max_time_per_dl):
if mode == 'all':
start_year = 1971
end_year = 2024
elif mode == 'nrt':
start_year = date.today().strftime('%Y')
end_year = (date.today() + timedelta(days=365)).strftime('%Y')
version = mode
else:
......@@ -36,7 +37,7 @@ def scraper(mode, version):
read_url = False
while read_url == False:
try:
link_data = re.findall("href=[\"\'](.*?)[\"\']", urlopen(link_url, timeout=15, cafile=certifi.where()).read().decode('utf-8-sig'))
link_data = re.findall("href=[\"\'](.*?)[\"\']", urlopen(link_url, timeout=max_time_per_dl, cafile=certifi.where()).read().decode('utf-8-sig'))
read_url = True
except HTTPError as error:
print('Data not retrieved because %s\nURL: %s'%(error, link_url))
......@@ -57,11 +58,10 @@ def scraper(mode, version):
#handles issue of server hanging for 3 minutes sporadically
#try downloading each link a certain number of times before giving up
n_tries_limit = 3
for link in link_list:
n_tries = 0
errcode = 999
while (n_tries < n_tries_limit) & (errcode != 0):
while (n_tries < n_max_tries) & (errcode != 0):
if n_tries == 0:
print('Checking/Downloading %s'%(link))
else:
......@@ -92,3 +92,102 @@ def scraper(mode, version):
process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf8', shell=True)
status = process.communicate()[0]
errcode = process.returncode
def download_metadata(n_max_tries, max_time_per_dl):
url_metadata = 'https://www.ncei.noaa.gov/pub/data/noaa/isd-history.csv'
download_location = "/esarchive/obs/ghost/NOAA_ISD/metadata/network_provided/NOAA_ISD_META_{}.csv"
n_tries = 0
errcode = 999
today = date.today()
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url_metadata, timeout=max_time_per_dl)
if r.status_code == 200:
with open(download_location.format(today.strftime('%Y%m%d')), 'wb') as outfile:
outfile.write(r.content)
print('Downloaded metadata')
errcode = r.status_code
elif r.status_code == 404:
print("No metadata found, error 404")
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
max_time_per_dl = max_time_per_dl*2 # increase waiting time
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
time.sleep(1)
"""
# create json from original metadata file
json_metadata = {}
with open('/esarchive/obs/ghost/NOAA_ISD/metadata/network_provided/NOAA_ISD_META.csv', 'r', encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['USAF']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/NOAA_ISD/metadata/processed/NOAA_ISD_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
"""
# create json in desired shape from current metadata file
json_metadata_now = {}
with open(download_location.format(today.strftime('%Y%m%d')), encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['USAF']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata_now[key] = row
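# resulting shape (illustrative): {"<USAF id>": {"USAF": {"values": ["<id>"], "update_time": ["<download date>"]}, "<other columns>": {...}}}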
# read standardised file to compare!
with open('/esarchive/obs/ghost/NOAA_ISD/metadata/processed/NOAA_ISD_META.json', 'r', encoding='utf-8') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if parameter in json_metadata_now[station].keys(): # check if column of csv exists in new file
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('{} not in new metadata file'.format(parameter))
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
for parameter in json_metadata_now[station]: # loop through all the parameters
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# is there a new parameter that wasn't in the old file?
if parameter in json_metadata[station].keys():
pass # parameter (column) is already there
else:
print('{} is new'.format(parameter))
json_metadata[station].update({parameter: json_metadata_now[station][parameter]})
# save
with open('/esarchive/obs/ghost/NOAA_ISD/metadata/processed/NOAA_ISD_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
\ No newline at end of file
......@@ -47,15 +47,14 @@ def download_data(mode, version, n_max_tries, max_time_per_dl):
options.add_argument("--no-sandbox")
#options.add_argument("--headless")
svc = webdriver.ChromeService(executable_path=binary_path)
driver = webdriver.Chrome(service=svc, options=options)
n_tries = 0
errcode = 999
while (n_tries < n_max_tries) and (errcode != 200):
try:
svc = webdriver.ChromeService(executable_path=binary_path)
driver = webdriver.Chrome(service=svc, options=options)
# open url
driver.get(baseurl)
WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.ID, 'sites-list'))) # wait till loaded
......@@ -110,9 +109,6 @@ def download_data(mode, version, n_max_tries, max_time_per_dl):
if n_tries == n_max_tries:
print('Failed downloading US_NADP_AMNet data {} times in {} seconds'.format(n_tries, max_time_per_dl))
print(os.path.split(download_location[:-5]))
os.rename("{}AMNET-ALL-h.csv".format(download_location), "/esarchive/obs/ghost/US_NADP_AMNet/original_files/{}/AMNET-ALL-h.csv".format(version))
......@@ -229,7 +225,7 @@ def download_metadata(n_max_tries, max_time_per_dl):
for parameter in json_metadata[station]:
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadata file
print("new {} --- old {}".format(json_metadata_now[station][parameter]['values'][0], json_metadata[station][parameter]['values'][-1]))
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
......
......@@ -8,9 +8,9 @@ import pandas
import os.path
import urllib
import time
import ssl
import zipfile
from compare_two_files import compare_files
import json
import csv
from chromedriver_py import binary_path
from selenium.webdriver.chrome.options import Options
......@@ -22,22 +22,16 @@ from selenium.webdriver.support import expected_conditions as EC
def scraper(mode, version):
def download_data(mode, version, n_max_tries, max_time_per_dl):
baseurl = 'https://nadp.slh.wisc.edu/networks/ammonia-monitoring-network/'
if mode == 'all':
bdate = date(2013, 12, 1) #date(1960, 1, 1) # date before record starts
edate = date(2024, 1, 1) #date.today() - timedelta(days = 1)
os.makedirs('/esarchive/obs/ghost/US_NADP_AMoN/original_files/{}/'.format(version), exist_ok=True)
download_location = '/esarchive/obs/ghost/US_NADP_AMoN/original_files/{}/'.format(version)
os.makedirs('/esarchive/obs/ghost/US_NADP_AMoN/original_files/{}/temp/'.format(version), exist_ok=True)
download_location = '/esarchive/obs/ghost/US_NADP_AMoN/original_files/{}/temp/'.format(version)
elif mode == 'nrt':
bdate = date(2024, 1, 1) #date.today() - timedelta(days = 1)
edate = date.today() - timedelta(days = 1)
download_location = '/esarchive/obs/ghost/US_NADP_AMoN/original_files/nrt/'
download_location = '/esarchive/obs/ghost/US_NADP_AMoN/original_files/nrt/temp/'
else:
print('time mode inapplicable')
......@@ -52,29 +46,30 @@ def scraper(mode, version):
svc = webdriver.ChromeService(executable_path=binary_path)
driver = webdriver.Chrome(service=svc, options=options)
n_tries = 0
errcode = 999
while (n_tries < n_max_tries) and (errcode != 200):
try:
# open url
driver.get(baseurl)
WebDriverWait(driver, 60).until(EC.presence_of_all_elements_located((By.ID, 'sites-list'))) # wait till loaded
WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.ID, 'sites-list'))) # wait till loaded
dropdown_element = driver.find_element(By.ID, 'data-type')
select = Select(dropdown_element)
options = [opt.get_attribute("text") for opt in select.options]
print(options)
select.select_by_visible_text("Bi-weekly")
time.sleep(3)
time.sleep(max_time_per_dl)
dropdown_element = driver.find_element(By.ID, 'sites-list')
select = Select(dropdown_element)
options = [opt.get_attribute("text") for opt in select.options]
print(options)
select.select_by_visible_text("All Sites")
time.sleep(3)
time.sleep(max_time_per_dl)
invalid_box = driver.find_element(By.ID, 'invalid')
invalid_box.click()
time.sleep(3)
time.sleep(max_time_per_dl)
# download
driver.find_element(By.ID, 'generate-button-text').click()
......@@ -85,5 +80,164 @@ def scraper(mode, version):
if os.path.isfile("{}AMoN-ALL-W-i.csv".format(download_location)):
print('AMoN-ALL-W-i.csv download successful')
errcode = 200
continue
except TimeoutException as e:
print(e)
max_time_per_dl = max_time_per_dl*2 # set waiting time to double
n_tries = n_tries+1
print("Number of tries: {}".format(n_tries))
continue
except WebDriverException as e:
print(e)
n_tries = n_tries+1
print("Number of tries: {}".format(n_tries))
continue
except:
print("Unknown error")
n_tries = n_tries+1
print("Number of tries: {}".format(n_tries))
continue
driver.close()
if n_tries == n_max_tries:
print('Failed downloading US_NADP_AMoN data {} times in {} seconds'.format(n_tries, max_time_per_dl))
os.rename("{}AMoN-ALL-W-i.csv".format(download_location), "/esarchive/obs/ghost/US_NADP_AMoN/original_files/{}/AMoN-ALL-W-i.csv".format(version))
def download_metadata(n_max_tries, max_time_per_dl):
baseurl = 'https://nadp.slh.wisc.edu/networks/ammonia-monitoring-network/'
#os.makedirs('/esarchive/obs/ghost/US_NADP_AMNet/metadata/network_provided/', exist_ok=True)
download_location = '/esarchive/obs/ghost/US_NADP_AMoN/metadata/network_provided/'
today = date.today()
# set up driver
options = Options()
prefs = {'download.default_directory' : download_location}
options.add_experimental_option('prefs', prefs)
options.add_argument("--no-sandbox")
options.add_argument("--headless")
n_tries = 0
errcode = 999
svc = webdriver.ChromeService(executable_path=binary_path)
driver = webdriver.Chrome(service=svc, options=options)
driver.maximize_window()
while (n_tries < n_max_tries) and (errcode != 200):
try:
# open url
driver.get(baseurl)
#WebDriverWait(driver, max_time_per_dl).until(EC.element_to_be_clickable((By.ID, 'invalid'))) # wait till loaded
time.sleep(max_time_per_dl)
invalid_box = driver.find_element(By.ID, 'download-show-inactive')
driver.execute_script("arguments[0].click()", invalid_box)
# download
#WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.ID, 'generate-button-text'))) # wait till loaded
time.sleep(max_time_per_dl)
bttn = driver.find_element(By.ID, 'network-data-submit')
driver.execute_script("arguments[0].click()", bttn)
# wait until download finished
while not os.path.exists(download_location+'amon.csv'):
time.sleep(1)
if os.path.isfile(download_location+'amon.csv'):
print('Amon metadata download successful')
errcode = 200
continue
except TimeoutException as e:
print(e)
max_time_per_dl = max_time_per_dl*2 # set waiting time to double
n_tries = n_tries+1
continue
except WebDriverException as e:
print(e)
n_tries = n_tries+1
print("Number of tries: {}".format(n_tries))
continue
except:
print("Unknown error")
max_time_per_dl = max_time_per_dl*2 # set waiting time to double
n_tries = n_tries+1
continue
if n_tries == n_max_tries:
print('Failed downloading AMoN metadata {} times in {} seconds'.format(n_tries, max_time_per_dl))
driver.close()
os.rename(download_location+'amon.csv', download_location+'US_NADP_AMoN_META_{}.csv'.format(today.strftime('%Y%m%d')))
# create json from original metadata file =====================================================================================
"""json_metadata = {}
with open('/esarchive/obs/ghost/US_NADP_AMoN/metadata/network_provided/US_NADP_AMoN_META.csv', 'r') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['siteId']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/US_NADP_AMoN/metadata/processed/US_NADP_AMoN_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
"""
# create json in desired shape from current metadata file
json_metadata_now = {}
with open(download_location+'US_NADP_AMoN_META_{}.csv'.format(today.strftime('%Y%m%d')), encoding='utf-8') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['siteId']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata_now[key] = row
# read standardised file to compare!
with open('/esarchive/obs/ghost/US_NADP_AMoN/metadata/processed/US_NADP_AMoN_META.json', 'r', encoding='utf-8') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadata file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# save
with open('/esarchive/obs/ghost/US_NADP_AMoN/metadata/processed/US_NADP_AMoN_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))
import hashlib # works for all types of data
import requests
def request_download(url, max_time_per_dl, download_location, file, year, n_max_tries):
Headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
n_tries = 0
errcode = 999
while (n_tries < n_max_tries) and (errcode != 200):
r = requests.get(url, headers=Headers, timeout=max_time_per_dl) # request on every attempt so retries actually re-download
if r.status_code == 200:
open(download_location + file.format(year), "wb").write(r.content)
print('Downloaded {}'.format(file.format(year)))
errcode = r.status_code
elif r.status_code == 404:
print("No ozone l data found, error 404")
errcode = 200
elif r.status_code == 403:
print("Permission denied for {}".format(file.format(year)))
errcode = 200
else:
# try again
print('Response error {}, attempt {}'.format(r.status_code, n_tries))
errcode = r.status_code
n_tries += 1
time.sleep(n_tries ** 2) # wait a lil more every time
if n_tries == n_max_tries:
print('Failed downloading {} {} times in {} seconds, error code {}'.format(url, n_tries, max_time_per_dl, errcode))
time.sleep(1)
# check if files are different
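# A minimal sketch of how hashlib could back this check by comparing content digests; the function name
# and arguments below are illustrative assumptions, not part of the existing scraper code.
def files_differ(path_a, path_b):
    with open(path_a, 'rb') as f_a, open(path_b, 'rb') as f_b:
        return hashlib.sha256(f_a.read()).hexdigest() != hashlib.sha256(f_b.read()).hexdigest()
# e.g. files_differ(download_location + file.format(year), previously_stored_path)  # previously_stored_path is a hypothetical name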
......
......@@ -61,7 +61,13 @@ if __name__ == "__main__":
'CNEMC': {'max_time_dl': 3},
'CANADA_NAPS': {'max_time_dl': 5},
'CAPMoN': {'max_time_dl': 5},
'US_NADP_AMNet': {'max_time_dl': 10}}
'US_NADP_AMNet': {'max_time_dl': 10},
'US_NADP_AMoN': {'max_time_dl': 7},
'MEXICO_CDMX': {'max_time_dl': 10},
'NOAA_ISD': {'max_time_dl': 15},
'MITECO': {'max_time_dl': 10},
'EANET': {'max_time_dl': 5},
'CHILE_SINCA': {'max_time_dl': 30}}
# download data
......@@ -71,7 +77,7 @@ if __name__ == "__main__":
dl_metadata = True
# networks you want to download
networks = [US_NADP_AMNet_download]
networks = [CHILE_SINCA_download]
# download all networks
#networks = ['all']
......@@ -107,8 +113,6 @@ if __name__ == "__main__":
if dl_data == True:
network.download_data(mode, version, n_max_tries, max_time_per_dl)
pass
if dl_metadata == True:
network.download_metadata(n_max_tries, max_time_per_dl)
pass