from selenium import webdriver
from bs4 import BeautifulSoup
import requests
import hashlib
from datetime import date
from datetime import timedelta
import pandas
import os.path
import urllib
import time
import ssl
import zipfile

from compare_two_files import compare_files


def scraper(mode, version):
    """Download CAPMoN network data files from the ECCC open-data API.

    Fetches, per year:
      * ground-level ozone CSVs        -> .../original_files/{version}/O3/
      * major-ions wet-deposition CSVs -> .../original_files/{version}/precip/major-ions/
    plus a single (fixed-period 1988-2017) particulate-metals CSV
                                       -> .../original_files/{version}/Particulate_Metals/

    Parameters
    ----------
    mode : str
        'all' to download the full historical record; 'nrt' is not
        supported for CAPMoN (the function exits).
    version : str
        GHOST version tag used to build the download directory paths.
    """
    base_url = "https://data-donnees.az.ec.gc.ca/api/file?path=/air/monitor/monitoring-of-atmospheric-gases/ground-level-ozone/"
    file = "AtmosphericGases-GroundLevelOzone-CAPMoN-AllSites-{}.csv"
    baseurl_ions = 'https://data-donnees.az.ec.gc.ca/api/file?path=/air/monitor/monitoring-of-atmospheric-precipitation-chemistry/major-ions/AtmosphericPrecipitationChemistry-MajorIons-CAPMoN-AllSites-{}.csv'
    components = ['O3', 'Particulate_Metals']

    if mode == 'all':
        bdate = date(1980, 1, 1)  # date before record starts
        edate = date.today() + timedelta(days=365)

        # create download directories
        for component in components:
            os.makedirs('/esarchive/obs/ghost/CAPMoN/original_files/{}/{}/'.format(version, component), exist_ok=True)
        os.makedirs('/esarchive/obs/ghost/CAPMoN/original_files/{}/precip/major-ions/'.format(version), exist_ok=True)

    elif mode == 'nrt':
        print("No nrt data for CAPMoN network.")
        quit()

    else:
        print('time mode inapplicable')
        quit()

    # create date array, per year
    years = pandas.date_range(bdate, edate, freq='Y').strftime('%Y').tolist()
    print(years)

    download_location = '/esarchive/obs/ghost/CAPMoN/original_files/{}/{}/'.format(version, 'O3')
    download_location_wetdep = '/esarchive/obs/ghost/CAPMoN/original_files/{}/precip/major-ions/'.format(version)

    # request header is loop-invariant: build it once
    Headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}

    for year in years:
        # --- ozone ---
        url = base_url + file.format(year)
        r = requests.get(url, headers=Headers, timeout=120)
        if r.status_code == 200:
            # use a context manager so the file handle is always closed
            with open(download_location + file.format(year), "wb") as outfile:
                outfile.write(r.content)
            print('Downloaded {}'.format(file.format(year)))
        elif r.status_code == 403:
            print("Permission denied for {}".format(file.format(year)))
        else:
            print(file.format(year) + " download failed or no data")
        time.sleep(1)  # be polite to the server between requests

        # --- major ions in wetdep ---
        res = requests.get(baseurl_ions.format(year), headers=Headers, timeout=120)
        time.sleep(1)
        if res.status_code == 200:
            # BUG FIX: download_location_wetdep already ends with
            # 'precip/major-ions/' — the old code appended that suffix a
            # second time, producing a path under a directory that was
            # never created.
            with open(download_location_wetdep + os.path.basename(baseurl_ions.format(year)), 'wb') as outfile:
                outfile.write(res.content)
            print('Downloaded {}'.format(os.path.basename(baseurl_ions.format(year))))
        elif res.status_code == 404:
            print("No major ions data in {}".format(year))
        else:
            print("Problem with major ions download")

    # --- particulate metals (single fixed-period file, not per-year) ---
    base_url = "https://data-donnees.az.ec.gc.ca/api/file?path=/air/monitor/monitoring-of-atmospheric-particles/particulate-metals/"
    file = "AtmosphericParticles-ParticulateMetals-GLBM-MultipleSites-1988_2017.csv"
    # BUG FIX: use the version parameter instead of a hard-coded '1.6',
    # consistent with the makedirs call above that creates this directory.
    download_location = '/esarchive/obs/ghost/CAPMoN/original_files/{}/Particulate_Metals/'.format(version)
    url = base_url + file
    r = requests.get(url, headers=Headers, timeout=120)
    if r.status_code == 200:
        with open(download_location + file, "wb") as outfile:
            outfile.write(r.content)
        print('Downloaded ' + file)
    elif r.status_code == 403:
        print("Permission denied for {}".format(file))
    else:
        print(file + " download failed or no data")