from selenium import webdriver
from bs4 import BeautifulSoup
import requests
import hashlib
from datetime import date
from datetime import timedelta
import pandas
import os.path
import urllib
import time
import ssl
import zipfile
from compare_two_files import compare_files
import csv
import json


def download_data(mode, version, n_max_tries, max_time_per_dl):

    base_url = "https://data-donnees.az.ec.gc.ca/api/file?path=/air/monitor/monitoring-of-atmospheric-gases/ground-level-ozone/"
    file = "AtmosphericGases-GroundLevelOzone-CAPMoN-AllSites-{}.csv"
    baseurl_ions = 'https://data-donnees.az.ec.gc.ca/api/file?path=/air/monitor/monitoring-of-atmospheric-precipitation-chemistry/major-ions/AtmosphericPrecipitationChemistry-MajorIons-CAPMoN-AllSites-{}.csv'
    components = ['O3', 'Particulate_Metals']

    if mode == 'all':
        bdate = date(1980, 1, 1)  # date(1960, 1, 1) # date before record starts
        edate = date.today() + timedelta(days=365)
        # create download directories
        for component in components:
            os.makedirs('/esarchive/obs/ghost/CAPMoN/original_files/{}/{}/'.format(version, component), exist_ok=True)
        os.makedirs('/esarchive/obs/ghost/CAPMoN/original_files/{}/precip/major-ions/'.format(version), exist_ok=True)
    elif mode == 'nrt':
        print("No nrt data for CAPMoN network.")
        quit()
    else:
        print('time mode inapplicable')
        quit()

    # create date array, one entry per year
    years = pandas.date_range(bdate, edate, freq='Y').strftime('%Y').tolist()
    print(years)

    # ozone
    download_location = '/esarchive/obs/ghost/CAPMoN/original_files/{}/{}/'.format(version, 'O3')
    download_location_wetdep = '/esarchive/obs/ghost/CAPMoN/original_files/{}/precip/major-ions/'.format(version)

    for year in years:
        url = base_url + file.format(year)
        Headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
        r = requests.get(url, headers=Headers, timeout=max_time_per_dl)
        n_tries = 0
        errcode = 999
        while (n_tries < n_max_tries) and (errcode != 200):
            if r.status_code == 200:
                with open(download_location + file.format(year), "wb") as outfile:
                    outfile.write(r.content)
                print('Downloaded {}'.format(file.format(year)))
                errcode = r.status_code
            elif r.status_code == 404:
                print("No ozone data found, error 404")
                errcode = 200
            elif r.status_code == 403:
                print("Permission denied for {}".format(file.format(year)))
                errcode = 200
            else:
                # try again, waiting a little longer every time
                print('Response error {}, attempt {}'.format(r.status_code, n_tries))
                errcode = r.status_code
                n_tries += 1
                time.sleep(n_tries ** 2)
                r = requests.get(url, headers=Headers, timeout=max_time_per_dl)
        if n_tries == n_max_tries:
            print('Failed downloading {} {} times in {} seconds, error code {}'.format(url, n_tries, max_time_per_dl, errcode))
        time.sleep(1)
        # major ions in wetdep
        res = requests.get(baseurl_ions.format(year), headers=Headers, timeout=max_time_per_dl)
        n_tries = 0
        errcode = 999
        while (n_tries < n_max_tries) and (errcode != 200):
            if res.status_code == 200:
                with open(download_location_wetdep + os.path.basename(baseurl_ions.format(year)), 'wb') as outfile:
                    outfile.write(res.content)
                print('Downloaded {}'.format(os.path.basename(baseurl_ions.format(year))))
                errcode = res.status_code
            elif res.status_code == 404:
                print("No major ions data found, error 404 {}".format(baseurl_ions.format(year)))
                errcode = 200
            else:
                # try again, waiting a little longer every time
                print('Response error {}, attempt {}'.format(res.status_code, n_tries))
                errcode = res.status_code
                n_tries += 1
                time.sleep(n_tries ** 2)
                res = requests.get(baseurl_ions.format(year), headers=Headers, timeout=max_time_per_dl)
        if n_tries == n_max_tries:
            print('Failed downloading {} {} times in {} seconds, error code {}'.format(baseurl_ions.format(year), n_tries, max_time_per_dl, errcode))
        time.sleep(1)

    # particulate metals
    base_url = "https://data-donnees.az.ec.gc.ca/api/file?path=/air/monitor/monitoring-of-atmospheric-particles/particulate-metals/"
    file = "AtmosphericParticles-ParticulateMetals-GLBM-MultipleSites-1988_2017.csv"
    download_location = '/esarchive/obs/ghost/CAPMoN/original_files/{}/Particulate_Metals/'.format(version)
    url = base_url + file
    Headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
    r = requests.get(url, headers=Headers, timeout=max_time_per_dl)
    n_tries = 0
    errcode = 999
    while (n_tries < n_max_tries) and (errcode != 200):
        if r.status_code == 200:
            with open(download_location + file, "wb") as outfile:
                outfile.write(r.content)
            print('Downloaded ' + file)
            errcode = r.status_code
        elif r.status_code == 404:
            print("No metal data found, error 404")
            errcode = 200
        elif r.status_code == 403:
            print("Permission denied for {}".format(file))
            errcode = 200
        else:
            # try again, waiting a little longer every time
            print('Response error {}, attempt {}'.format(r.status_code, n_tries))
            errcode = r.status_code
            n_tries += 1
            time.sleep(n_tries ** 2)
            r = requests.get(url, headers=Headers, timeout=max_time_per_dl)
    if n_tries == n_max_tries:
        print('Failed downloading {} {} times in {} seconds, error code {}'.format(url, n_tries, max_time_per_dl, errcode))
    time.sleep(1)
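
# The three download blocks above repeat the same request / retry / quadratic-backoff
# pattern. The helper below is a minimal sketch of how that pattern could be factored
# out; fetch_with_retries() is a hypothetical name and is not called by the original
# functions.
def fetch_with_retries(url, headers, timeout, n_max_tries):
    """Request a URL, retrying with quadratic backoff; return the response, or None on failure."""
    n_tries = 0
    while n_tries < n_max_tries:
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code in (200, 403, 404):
            # success, or an error that retrying will not fix
            return response
        n_tries += 1
        print('Response error {}, attempt {}'.format(response.status_code, n_tries))
        time.sleep(n_tries ** 2)  # wait a little longer every time
    return None
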
def download_metadata(n_max_tries, max_time_per_dl):

    url_metadata = 'https://data-donnees.az.ec.gc.ca/api/file?path=/air/monitor/national-air-pollution-surveillance-naps-program/ProgramInformation-InformationProgramme/StationsNAPS-StationsSNPA.csv'
    download_location = "/esarchive/obs/ghost/CAPMoN/metadata/network_provided/CAPMoN_META_{}.csv"
    Headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}

    r = requests.get(url_metadata, headers=Headers, timeout=max_time_per_dl)
    n_tries = 0
    errcode = 999
    today = date.today()

    while (n_tries < n_max_tries) and (errcode != 200):
        if r.status_code == 200:
            with open(download_location.format(today.strftime('%Y%m%d')), 'wb') as outfile:
                outfile.write(r.content)
            print('Downloaded metadata')
            errcode = r.status_code
        elif r.status_code == 404:
            print("No metadata found, error 404")
            errcode = 200
        else:
            # try again, waiting a little longer every time
            print('Response error {}, attempt {}'.format(r.status_code, n_tries))
            errcode = r.status_code
            n_tries += 1
            time.sleep(n_tries ** 2)
            r = requests.get(url_metadata, headers=Headers, timeout=max_time_per_dl)
    if n_tries == n_max_tries:
        print('Failed downloading {} {} times in {} seconds, error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
    time.sleep(1)

    # create json from the original metadata file
    json_metadata = {}
    with open('/esarchive/obs/ghost/CAPMoN/metadata/network_provided/CAPMoN_META.csv', 'r', encoding='ISO-8859-1') as file:
        csv_filedata = csv.DictReader(file)
        for row in csv_filedata:
            key = row['SiteName_NomDuSite']
            update_date = today.strftime('%Y-%m-%d')
            for parameter in row:
                # create inner dictionary for every parameter
                row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]}
            json_metadata[key] = row

    with open('/esarchive/obs/ghost/CAPMoN/metadata/processed/CAPMoN_META.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(json_metadata, indent=4))

    """
    # create json in desired shape from the current metadata file
    json_metadata_now = {}
    with open(download_location.format(today.strftime('%Y%m%d'))) as file:
        csv_filedata = csv.DictReader(file)
        for row in csv_filedata:
            key = row['NAPS_ID']
            update_date = today.strftime('%Y-%m-%d')
            for parameter in row:
                # create inner dictionary for every parameter
                row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]}
            json_metadata_now[key] = row

    # read the standardised file to compare against
    with open('/esarchive/obs/ghost/CANADA_NAPS/metadata/processed/CANADA_NAPS_META.json', 'r', encoding='utf-8') as f:
        json_metadata = json.loads(f.read())

    for station in json_metadata:  # loop through all the old stations
        if station in json_metadata_now.keys():  # if the station is in the current metadata, go on
            for parameter in json_metadata[station]:
                # compare the last entry in the standardised file to the value in the new file;
                # if the value differs, append it to the standardised metadata file
                if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]:
                    print("new {} --- old {}".format(json_metadata_now[station][parameter]['values'][0], json_metadata[station][parameter]['values'][-1]))
                    json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
                    json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
                else:
                    pass
        else:
            print('Station {} was abolished'.format(station))

    for station in json_metadata_now:  # loop through all the new stations
        if station in json_metadata.keys():  # if the station is in the old metadata
            pass  # comparison was done before
        else:  # a new station appeared
            print('New station {}'.format(station))
            json_metadata.update({station: json_metadata_now[station]})

    # save
    with open('/esarchive/obs/ghost/CANADA_NAPS/metadata/processed/CANADA_NAPS_META.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(json_metadata, indent=4))
    """
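
# A minimal usage sketch, assuming the script is run directly; the argument values
# below (version '1.6', 3 retries, a 30 second per-download timeout) are illustrative
# assumptions, not values prescribed by the original code.
if __name__ == '__main__':
    download_data(mode='all', version='1.6', n_max_tries=3, max_time_per_dl=30)
    download_metadata(n_max_tries=3, max_time_per_dl=30)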