# CAPMoN_download.py
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
import hashlib
from datetime import date
from datetime import timedelta
import pandas
import os.path
import urllib
import time
import ssl 
import zipfile
from compare_two_files import compare_files
import csv
import json
def download_data(mode, version, n_max_tries, max_time_per_dl):
    """Download CAPMoN ground-level ozone, precipitation major-ion and
    particulate-metal CSV files into the GHOST original_files archive.

    Parameters
    ----------
    mode : str
        'all' downloads the complete historical record; 'nrt' is not
        available for the CAPMoN network and aborts the script.
    version : str
        GHOST version string used to build the download directories.
    n_max_tries : int
        Maximum number of retry attempts per file.
    max_time_per_dl : float
        Per-request timeout in seconds, passed to requests.get.
    """

    base_url = "https://data-donnees.az.ec.gc.ca/api/file?path=/air/monitor/monitoring-of-atmospheric-gases/ground-level-ozone/"
    file = "AtmosphericGases-GroundLevelOzone-CAPMoN-AllSites-{}.csv"

    baseurl_ions = 'https://data-donnees.az.ec.gc.ca/api/file?path=/air/monitor/monitoring-of-atmospheric-precipitation-chemistry/major-ions/AtmosphericPrecipitationChemistry-MajorIons-CAPMoN-AllSites-{}.csv'

    components = ['O3', 'Particulate_Metals']

    if mode == 'all':
        bdate = date(1980, 1, 1)  # date before the record starts
        edate = date.today() + timedelta(days=365)
        # create the download directories
        for component in components:
            os.makedirs('/esarchive/obs/ghost/CAPMoN/original_files/{}/{}/'.format(version, component), exist_ok=True)
        os.makedirs('/esarchive/obs/ghost/CAPMoN/original_files/{}/precip/major-ions/'.format(version), exist_ok=True)
    elif mode == 'nrt':
        print("No nrt data for CAPMoN network.")
        quit()
    else:
        print('time mode inapplicable')
        quit()

    headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}

    # create date array, per year
    years = pandas.date_range(bdate, edate, freq='Y').strftime('%Y').tolist()
    print(years)

    download_location = '/esarchive/obs/ghost/CAPMoN/original_files/{}/{}/'.format(version, 'O3')
    download_location_wetdep = '/esarchive/obs/ghost/CAPMoN/original_files/{}/precip/major-ions/'.format(version)

    for year in years:
        # ozone (one file per year)
        _fetch_file(base_url + file.format(year), download_location + file.format(year),
                    headers, n_max_tries, max_time_per_dl)
        time.sleep(1)

        # major ions in wet deposition (one file per year)
        # BUG FIX: the old code appended 'precip/major-ions/' a second time even
        # though download_location_wetdep already ends with that segment.
        ions_url = baseurl_ions.format(year)
        _fetch_file(ions_url, download_location_wetdep + os.path.basename(ions_url),
                    headers, n_max_tries, max_time_per_dl)
        time.sleep(1)

    # particulate metals (single static file covering 1988-2017)
    base_url = "https://data-donnees.az.ec.gc.ca/api/file?path=/air/monitor/monitoring-of-atmospheric-particles/particulate-metals/"
    file = "AtmosphericParticles-ParticulateMetals-GLBM-MultipleSites-1988_2017.csv"
    # BUG FIX: version was hard-coded to '1.6' here instead of using the parameter.
    download_location = '/esarchive/obs/ghost/CAPMoN/original_files/{}/Particulate_Metals/'.format(version)
    _fetch_file(base_url + file, download_location + file, headers, n_max_tries, max_time_per_dl)
    time.sleep(1)


def _fetch_file(url, dest, headers, n_max_tries, max_time_per_dl):
    """Download a single file to *dest* with exponential-backoff retries.

    404 (not found) and 403 (forbidden) are treated as final: they are
    reported and not retried. Any other non-200 status is retried up to
    n_max_tries times, waiting n_tries**2 seconds between attempts.
    """
    n_tries = 0
    errcode = 999

    while (n_tries < n_max_tries) and (errcode != 200):
        # BUG FIX: re-issue the request on every attempt; the original code
        # fetched once before the loop and re-inspected the same stale
        # response, so retries could never succeed.
        r = requests.get(url, headers=headers, timeout=max_time_per_dl)
        if r.status_code == 200:
            with open(dest, "wb") as outfile:
                outfile.write(r.content)
            print('Downloaded {}'.format(os.path.basename(dest)))
            errcode = r.status_code
        elif r.status_code == 404:
            print("No data found, error 404: {}".format(url))
            errcode = 200
        elif r.status_code == 403:
            print("Permission denied for {}".format(url))
            errcode = 200
        else:
            # try again
            print('Response error {}, attempt {}'.format(r.status_code, n_tries))
            errcode = r.status_code
            n_tries += 1
            time.sleep(n_tries ** 2)  # wait a lil more every time

    if n_tries == n_max_tries:
        print('Failed downloading {} {} times in {} seconds, error code {}'.format(url, n_tries, max_time_per_dl, errcode))
def download_metadata(n_max_tries, max_time_per_dl):
    """Download the CAPMoN site-listing metadata CSV and merge it into the
    standardised JSON metadata record.

    The freshly downloaded CSV (one file per day, date-stamped) is compared
    station-by-station and parameter-by-parameter against the processed JSON
    file; changed values are appended with their update date, abolished and
    newly appeared stations are reported, and the JSON file is rewritten.

    Parameters
    ----------
    n_max_tries : int
        Maximum number of retry attempts for the metadata download.
    max_time_per_dl : float
        Per-request timeout in seconds, passed to requests.get.
    """

    url_metadata = 'https://data-donnees.az.ec.gc.ca/api/file?path=/air/monitor/networks-and-studies/canadian-air-and-precipitation-monitoring-network-capmon/Networks_Studies-Reseaux_etudes-CAPMoN-SiteListing-ListeDesSites_EN-FR.csv'
    download_location = "/esarchive/obs/ghost/CAPMoN/metadata/network_provided/CAPMoN_META_{}.csv"

    headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}

    today = date.today()
    n_tries = 0
    errcode = 999

    while (n_tries < n_max_tries) and (errcode != 200):
        # BUG FIX: re-issue the request on every attempt; the original code
        # fetched once before the loop, so retries re-checked a stale response.
        r = requests.get(url_metadata, headers=headers, timeout=max_time_per_dl)
        if r.status_code == 200:
            with open(download_location.format(today.strftime('%Y%m%d')), 'wb') as outfile:
                outfile.write(r.content)
            print('Downloaded metadata')
            errcode = r.status_code
        elif r.status_code == 404:
            print("No metadata found, error 404")
            errcode = 200
        else:
            # try again
            print('Response error {}, attempt {}'.format(r.status_code, n_tries))
            errcode = r.status_code
            n_tries += 1
            time.sleep(n_tries ** 2)  # wait a lil more every time

    if n_tries == n_max_tries:
        print('Failed downloading {} {} times in {} seconds, error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
    time.sleep(1)

    # create json in desired shape from current metadata file
    json_metadata_now = {}
    update_date = today.strftime('%Y-%m-%d')
    with open(download_location.format(today.strftime('%Y%m%d')), encoding='ISO-8859-1') as file:
        csv_filedata = csv.DictReader(file)
        for row in csv_filedata:
            key = row['SiteName_NomDuSite']
            # create inner dictionary for every parameter
            for parameter in row:
                row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]}
            json_metadata_now[key] = row

    # read standardised file to compare!
    with open('/esarchive/obs/ghost/CAPMoN/metadata/processed/CAPMoN_META.json', 'r', encoding='ISO-8859-1') as f:
        json_metadata = json.loads(f.read())

    for station in json_metadata:  # loop through all the old stations
        if station in json_metadata_now:  # if station is in current meta data, go on
            for parameter in json_metadata[station]:
                old_value = json_metadata[station][parameter]['values'][-1]
                new_value = json_metadata_now[station][parameter]['values'][0]
                # compare last entry in standardised file to value in new file
                if old_value != new_value:
                    # BUG FIX: the original passed only one argument to
                    # .format() (the second value landed outside the call),
                    # raising IndexError whenever a value changed.
                    print("old {} --- new {}".format(old_value, new_value))
                    # if different value, append the standardised metadata file
                    json_metadata[station][parameter]['values'].append(new_value)
                    json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
        else:
            print('Station {} was abolished'.format(station))

    for station in json_metadata_now:  # loop through all the new stations
        if station not in json_metadata:  # new station appeared!
            print('New station {}'.format(station))
            json_metadata.update({station: json_metadata_now[station]})

    # save the merged metadata record
    with open('/esarchive/obs/ghost/CAPMoN/metadata/processed/CAPMoN_META.json', 'w', encoding='ISO-8859-1') as f:
        f.write(json.dumps(json_metadata, indent=4))