import csv
import json
import os
import sys
import time
from datetime import date, timedelta

import pandas
import requests


def download_data(mode, version, n_max_tries, max_time_per_dl):
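    """Download CAPMoN ozone, major-ion and particulate-metal files.

    mode -- 'all' downloads the full record; 'nrt' is not available for CAPMoN
    version -- GHOST version string used to build the download directories
    n_max_tries -- maximum number of retry attempts per file
    max_time_per_dl -- per-request timeout in seconds
    """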

    base_url = "https://data-donnees.az.ec.gc.ca/api/file?path=/air/monitor/monitoring-of-atmospheric-gases/ground-level-ozone/"
    file = "AtmosphericGases-GroundLevelOzone-CAPMoN-AllSites-{}.csv"

    baseurl_ions = 'https://data-donnees.az.ec.gc.ca/api/file?path=/air/monitor/monitoring-of-atmospheric-precipitation-chemistry/major-ions/AtmosphericPrecipitationChemistry-MajorIons-CAPMoN-AllSites-{}.csv'

    components = ['O3', 'Particulate_Metals']

    if mode == 'all':
        bdate = date(1980, 1, 1)  # date before the record starts
        edate = date.today() + timedelta(days=365)
        # create download directories
        for component in components:
            os.makedirs('/esarchive/obs/ghost/CAPMoN/original_files/{}/{}/'.format(version, component), exist_ok=True)
        os.makedirs('/esarchive/obs/ghost/CAPMoN/original_files/{}/precip/major-ions/'.format(version), exist_ok=True)
    elif mode == 'nrt':
        print("No nrt data for the CAPMoN network.")
        sys.exit()
    else:
        print('Invalid mode: {}'.format(mode))
        sys.exit()

    # create date array, per year
    years = pandas.date_range(bdate, edate, freq='Y').strftime('%Y').tolist()
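    # freq='Y' emits year-end timestamps, so this is e.g. ['1980', '1981', ..., '2024'],
    # with the final entry depending on today's date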

    print(years)

    # ozone
    download_location = '/esarchive/obs/ghost/CAPMoN/original_files/{}/{}/'.format(version, 'O3')
    download_location_wetdep = '/esarchive/obs/ghost/CAPMoN/original_files/{}/precip/major-ions/'.format(version)

    Headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}

    for year in years:
        url = base_url + file.format(year)

        n_tries = 0
        errcode = 999

        while (n_tries < n_max_tries) and (errcode != 200):
            r = requests.get(url, headers=Headers, timeout=max_time_per_dl)
            if r.status_code == 200:
                with open(download_location + file.format(year), 'wb') as outfile:
                    outfile.write(r.content)
                print('Downloaded {}'.format(file.format(year)))
                errcode = r.status_code
            elif r.status_code == 404:
                print('No ozone data found for {}, error 404'.format(year))
                errcode = 200  # nothing to download for this year, stop retrying
            elif r.status_code == 403:
                print('Permission denied for {}'.format(file.format(year)))
                errcode = 200  # retrying will not help, move on
            else:
                # try again, backing off a little longer every time
                print('Response error {}, attempt {}'.format(r.status_code, n_tries))
                errcode = r.status_code
                n_tries += 1
                time.sleep(n_tries ** 2)

        if n_tries == n_max_tries:
            print('Failed downloading {} after {} attempts ({} s timeout per request), last error code {}'.format(url, n_tries, max_time_per_dl, errcode))

        time.sleep(1)

        # major ions in wet deposition
        n_tries = 0
        errcode = 999

        while (n_tries < n_max_tries) and (errcode != 200):
            res = requests.get(baseurl_ions.format(year), headers=Headers, timeout=max_time_per_dl)
            if res.status_code == 200:
                # download_location_wetdep already ends in 'precip/major-ions/'
                with open(download_location_wetdep + os.path.basename(baseurl_ions.format(year)), 'wb') as outfile:
                    outfile.write(res.content)
                print('Downloaded {}'.format(os.path.basename(baseurl_ions.format(year))))
                errcode = res.status_code
            elif res.status_code == 404:
                print('No major ions data found, error 404 {}'.format(baseurl_ions.format(year)))
                errcode = 200  # nothing to download for this year, stop retrying
            else:
                # try again, backing off a little longer every time
                print('Response error {}, attempt {}'.format(res.status_code, n_tries))
                errcode = res.status_code
                n_tries += 1
                time.sleep(n_tries ** 2)

        if n_tries == n_max_tries:
            print('Failed downloading {} after {} attempts ({} s timeout per request), last error code {}'.format(baseurl_ions.format(year), n_tries, max_time_per_dl, errcode))

        time.sleep(1)


    # particulate metals
    base_url = "https://data-donnees.az.ec.gc.ca/api/file?path=/air/monitor/monitoring-of-atmospheric-particles/particulate-metals/"
    file = "AtmosphericParticles-ParticulateMetals-GLBM-MultipleSites-1988_2017.csv"
    download_location = '/esarchive/obs/ghost/CAPMoN/original_files/{}/Particulate_Metals/'.format(version)

    url = base_url + file

    n_tries = 0
    errcode = 999

    while (n_tries < n_max_tries) and (errcode != 200):
        r = requests.get(url, headers=Headers, timeout=max_time_per_dl)
        if r.status_code == 200:
            with open(download_location + file, 'wb') as outfile:
                outfile.write(r.content)
            print('Downloaded ' + file)
            errcode = r.status_code
        elif r.status_code == 404:
            print('No metal data found, error 404')
            errcode = 200  # nothing to download, stop retrying
        elif r.status_code == 403:
            print('Permission denied for {}'.format(file))
            errcode = 200  # retrying will not help, move on
        else:
            # try again, backing off a little longer every time
            print('Response error {}, attempt {}'.format(r.status_code, n_tries))
            errcode = r.status_code
            n_tries += 1
            time.sleep(n_tries ** 2)

    if n_tries == n_max_tries:
        print('Failed downloading {} after {} attempts ({} s timeout per request), last error code {}'.format(url, n_tries, max_time_per_dl, errcode))
    time.sleep(1)
def download_metadata(n_max_tries, max_time_per_dl):
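    """Download the network-provided station metadata CSV (as a dated copy) and
    rebuild the processed CAPMoN_META.json record from the original metadata
    file."""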

    url_metadata = 'https://data-donnees.az.ec.gc.ca/api/file?path=/air/monitor/national-air-pollution-surveillance-naps-program/ProgramInformation-InformationProgramme/StationsNAPS-StationsSNPA.csv'
    download_location = "/esarchive/obs/ghost/CAPMoN/metadata/network_provided/CAPMoN_META_{}.csv"

    Headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}

    n_tries = 0
    errcode = 999
    today = date.today()

    while (n_tries < n_max_tries) and (errcode != 200):
        r = requests.get(url_metadata, headers=Headers, timeout=max_time_per_dl)
        if r.status_code == 200:
            with open(download_location.format(today.strftime('%Y%m%d')), 'wb') as outfile:
                outfile.write(r.content)
            print('Downloaded metadata')
            errcode = r.status_code
        elif r.status_code == 404:
            print("No metadata found, error 404")
            errcode = 200  # nothing to download, stop retrying
        else:
            # try again, backing off a little longer every time
            print('Response error {}, attempt {}'.format(r.status_code, n_tries))
            errcode = r.status_code
            n_tries += 1
            time.sleep(n_tries ** 2)

    if n_tries == n_max_tries:
        print('Failed downloading {} after {} attempts ({} s timeout per request), last error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
    time.sleep(1)

    # create json from original metadata file
    json_metadata = {}
    with open('/esarchive/obs/ghost/CAPMoN/metadata/network_provided/CAPMoN_META.csv', 'r', encoding='ISO-8859-1') as file:
        csv_filedata = csv.DictReader(file)

        for row in csv_filedata:  
            key = row['SiteName_NomDuSite']
            update_date = today.strftime('%Y-%m-%d')
            for parameter in row:
                row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
            json_metadata[key] = row
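
    # The resulting JSON maps each site name to its CSV row, with every field
    # wrapped as {'values': [...], 'update_time': [...]} so that later runs can
    # append revised values. Illustrative shape (site name is an example):
    # "Egbert": {
    #     "SiteName_NomDuSite": {"values": ["Egbert"], "update_time": ["2024-03-01"]},
    #     ...
    # }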

    with open('/esarchive/obs/ghost/CAPMoN/metadata/processed/CAPMoN_META.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(json_metadata, indent=4))
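
    # The disabled block below sketches a comparison between a freshly
    # downloaded metadata file and the processed record; note that it still
    # points at the CANADA_NAPS paths and the NAPS_ID key rather than the
    # CAPMoN equivalents, so it is kept commented out.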

    """
    # create json in desired shape from current metadata file
    json_metadata_now = {}  
    with open(download_location.format(today.strftime('%Y%m%d'))) as file:
        csv_filedata = csv.DictReader(file)

        for row in csv_filedata:  
            key = row['NAPS_ID']
            update_date = today.strftime('%Y-%m-%d')
            for parameter in row:
                row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
            json_metadata_now[key] = row

    
    # read standardised file to compare!
    with open('/esarchive/obs/ghost/CANADA_NAPS/metadata/processed/CANADA_NAPS_META.json', 'r', encoding='utf-8') as f:
        json_metadata = json.loads(f.read())

        for station in json_metadata: # loop through all the old stations
            if station in json_metadata_now.keys(): # if station is in current meta data, go on
                for parameter in json_metadata[station]:
                    if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
                        # if the value differs, append it to the standardised metadata file
                        print("new {} --- old {}".format(json_metadata_now[station][parameter]['values'][0], json_metadata[station][parameter]['values'][-1]))
                        json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
                        json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
                    else:
                        pass
            else:
                print('Station {} no longer appears in the current metadata'.format(station))

        for station in json_metadata_now: # loop through all the new stations
            if station in json_metadata.keys(): # if station is in old meta data
                pass # comparison was done before
            else: # new station appeared!
                print('New station {}'.format(station))
                json_metadata.update({station: json_metadata_now[station]})


    # save
    with open('/esarchive/obs/ghost/CANADA_NAPS/metadata/processed/CANADA_NAPS_META.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(json_metadata, indent=4))"""
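

# Minimal usage sketch (hypothetical entry point; the argument values below are
# illustrative, not prescribed by the original script):
if __name__ == '__main__':
    download_data(mode='all', version='1.6', n_max_tries=3, max_time_per_dl=30)
    download_metadata(n_max_tries=3, max_time_per_dl=30)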