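"""Download script for CAPMoN (Canadian Air and Precipitation Monitoring Network) data.

Fetches the yearly ground-level ozone and major-ions precipitation chemistry
files, the particulate metals file, and the network metadata from the
Environment and Climate Change Canada open-data API into the GHOST
original_files tree.
"""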
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
import hashlib
from datetime import date
from datetime import timedelta
import pandas
import os
import os.path
import urllib
import time
import ssl
import zipfile
import csv
import json

from compare_two_files import compare_files
def download_data(mode, version, n_max_tries, max_time_per_dl):
    base_url = "https://data-donnees.az.ec.gc.ca/api/file?path=/air/monitor/monitoring-of-atmospheric-gases/ground-level-ozone/"
    file = "AtmosphericGases-GroundLevelOzone-CAPMoN-AllSites-{}.csv"
    baseurl_ions = 'https://data-donnees.az.ec.gc.ca/api/file?path=/air/monitor/monitoring-of-atmospheric-precipitation-chemistry/major-ions/AtmosphericPrecipitationChemistry-MajorIons-CAPMoN-AllSites-{}.csv'

    components = ['O3', 'Particulate_Metals']

    if mode == 'all':
        bdate = date(1980, 1, 1)  # date before the record starts
        edate = date.today() + timedelta(days=365)

        # create the download directories
        for component in components:
            os.makedirs('/esarchive/obs/ghost/CAPMoN/original_files/{}/{}/'.format(version, component), exist_ok=True)
        os.makedirs('/esarchive/obs/ghost/CAPMoN/original_files/{}/precip/major-ions/'.format(version), exist_ok=True)

    elif mode == 'nrt':
        print("There is no nrt data for the CAPMoN network.")
        quit()

    else:
        print('Time mode inapplicable.')
        quit()

    # create a date array with one entry per year
    years = pandas.date_range(bdate, edate, freq='Y').strftime('%Y').tolist()
    print(years)

    # ozone and major ions are downloaded per year
    download_location = '/esarchive/obs/ghost/CAPMoN/original_files/{}/{}/'.format(version, 'O3')
    download_location_wetdep = '/esarchive/obs/ghost/CAPMoN/original_files/{}/precip/major-ions/'.format(version)
    headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
    for year in years:

        # ozone
        url = base_url + file.format(year)
        n_tries = 0
        errcode = 999
        while (n_tries < n_max_tries) and (errcode != 200):
            # re-request on every retry so the status code can actually change
            r = requests.get(url, headers=headers, timeout=max_time_per_dl)
            if r.status_code == 200:
                with open(download_location + file.format(year), "wb") as outfile:
                    outfile.write(r.content)
                print('Downloaded {}'.format(file.format(year)))
                errcode = r.status_code
            elif r.status_code == 404:
                print("No ozone data found, error 404")
                errcode = 200
            elif r.status_code == 403:
                print("Permission denied for {}".format(file.format(year)))
                errcode = 200
            else:
                # try again, waiting a little longer on every attempt
                print('Response error {}, attempt {}'.format(r.status_code, n_tries))
                errcode = r.status_code
                n_tries += 1
                time.sleep(n_tries ** 2)

        if n_tries == n_max_tries:
            print('Failed downloading {} after {} tries ({} s timeout each), error code {}'.format(url, n_tries, max_time_per_dl, errcode))
        # major ions (precipitation chemistry)
        n_tries = 0
        errcode = 999
        while (n_tries < n_max_tries) and (errcode != 200):
            # re-request on every retry so the status code can actually change
            res = requests.get(baseurl_ions.format(year), headers=headers, timeout=max_time_per_dl)
            if res.status_code == 200:
                # download_location_wetdep already ends in precip/major-ions/
                with open(download_location_wetdep + os.path.basename(baseurl_ions.format(year)), 'wb') as outfile:
                    outfile.write(res.content)
                print('Downloaded {}'.format(os.path.basename(baseurl_ions.format(year))))
                errcode = res.status_code
            elif res.status_code == 404:
                print("No major ions data found, error 404 {}".format(baseurl_ions.format(year)))
                errcode = 200
            else:
                # try again, waiting a little longer on every attempt
                print('Response error {}, attempt {}'.format(res.status_code, n_tries))
                errcode = res.status_code
                n_tries += 1
                time.sleep(n_tries ** 2)

        if n_tries == n_max_tries:
            print('Failed downloading {} after {} tries ({} s timeout each), error code {}'.format(baseurl_ions.format(year), n_tries, max_time_per_dl, errcode))
    # particulate metals (single file covering 1988-2017)
    base_url = "https://data-donnees.az.ec.gc.ca/api/file?path=/air/monitor/monitoring-of-atmospheric-particles/particulate-metals/"
    file = "AtmosphericParticles-ParticulateMetals-GLBM-MultipleSites-1988_2017.csv"
    download_location = '/esarchive/obs/ghost/CAPMoN/original_files/{}/Particulate_Metals/'.format(version)
    url = base_url + file

    n_tries = 0
    errcode = 999
    while (n_tries < n_max_tries) and (errcode != 200):
        # re-request on every retry so the status code can actually change
        r = requests.get(url, headers=headers, timeout=max_time_per_dl)
        if r.status_code == 200:
            with open(download_location + file, "wb") as outfile:
                outfile.write(r.content)
            print('Downloaded ' + file)
            errcode = r.status_code
        elif r.status_code == 404:
            print("No metal data found, error 404")
            errcode = 200
        elif r.status_code == 403:
            print("Permission denied for {}".format(file))
            errcode = 200
        else:
            # try again, waiting a little longer on every attempt
            print('Response error {}, attempt {}'.format(r.status_code, n_tries))
            errcode = r.status_code
            n_tries += 1
            time.sleep(n_tries ** 2)

    if n_tries == n_max_tries:
        print('Failed downloading {} after {} tries ({} s timeout each), error code {}'.format(url, n_tries, max_time_per_dl, errcode))

    time.sleep(1)
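# The three download loops above repeat the same request/retry/backoff pattern.
# A minimal sketch of a helper that could consolidate them; the name
# fetch_with_retries and its signature are illustrative, not part of the
# original script, and nothing below is wired in.
def fetch_with_retries(url, headers, n_max_tries, timeout):
    """Return the response body on HTTP 200, or None on 403/404 or after exhausting retries."""
    for n_tries in range(1, n_max_tries + 1):
        r = requests.get(url, headers=headers, timeout=timeout)
        if r.status_code == 200:
            return r.content
        if r.status_code in (403, 404):
            print('Giving up on {}: HTTP {}'.format(url, r.status_code))
            return None
        print('Response error {}, attempt {}'.format(r.status_code, n_tries))
        time.sleep(n_tries ** 2)  # wait a little longer on every attempt
    print('Failed downloading {} after {} tries'.format(url, n_max_tries))
    return None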
def download_metadata(n_max_tries, max_time_per_dl):

    url_metadata = 'https://data-donnees.az.ec.gc.ca/api/file?path=/air/monitor/national-air-pollution-surveillance-naps-program/ProgramInformation-InformationProgramme/StationsNAPS-StationsSNPA.csv'
    download_location = "/esarchive/obs/ghost/CAPMoN/metadata/network_provided/CAPMoN_META_{}.csv"
    headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}

    n_tries = 0
    errcode = 999
    today = date.today()

    while (n_tries < n_max_tries) and (errcode != 200):
        # re-request on every retry so the status code can actually change
        r = requests.get(url_metadata, headers=headers, timeout=max_time_per_dl)
        if r.status_code == 200:
            with open(download_location.format(today.strftime('%Y%m%d')), 'wb') as outfile:
                outfile.write(r.content)
            print('Downloaded metadata')
            errcode = r.status_code
        elif r.status_code == 404:
            print("No metadata found, error 404")
            errcode = 200
        else:
            # try again, waiting a little longer on every attempt
            print('Response error {}, attempt {}'.format(r.status_code, n_tries))
            errcode = r.status_code
            n_tries += 1
            time.sleep(n_tries ** 2)

    if n_tries == n_max_tries:
        print('Failed downloading {} after {} tries ({} s timeout each), error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))

    time.sleep(1)
    # create json from the original metadata file
    json_metadata = {}
    with open('/esarchive/obs/ghost/CAPMoN/metadata/network_provided/CAPMoN_META.csv', 'r', encoding='ISO-8859-1') as file:
        csv_filedata = csv.DictReader(file)
        for row in csv_filedata:
            key = row['SiteName_NomDuSite']
            update_date = today.strftime('%Y-%m-%d')
            for parameter in row:
                row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]}  # create an inner dictionary for every parameter
            json_metadata[key] = row

    with open('/esarchive/obs/ghost/CAPMoN/metadata/processed/CAPMoN_META.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(json_metadata, indent=4))
"""
# create json in desired shape from current metadata file
json_metadata_now = {}
with open(download_location.format(today.strftime('%Y%m%d'))) as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['NAPS_ID']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata_now[key] = row
# read standardised file to compare!
with open('/esarchive/obs/ghost/CANADA_NAPS/metadata/processed/CANADA_NAPS_META.json', 'r', encoding='utf-8') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadeta file
print("new {} --- old {}".format(json_metadata_now[station][parameter]['values'][0], json_metadata[station][parameter]['values'][-1]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# safe
with open('/esarchive/obs/ghost/CANADA_NAPS/metadata/processed/CANADA_NAPS_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))"""
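# A minimal usage sketch, assuming the typical all-mode invocation; the version
# tag, retry count, and per-download timeout below are assumed example values.
if __name__ == '__main__':
    download_data(mode='all', version='1.6', n_max_tries=3, max_time_per_dl=30)
    download_metadata(n_max_tries=3, max_time_per_dl=30)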