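"""Download CAPMoN original data files from the Environment and Climate Change Canada (ECCC) open-data API.

Fetches yearly ground-level ozone and precipitation major-ion CSV files, plus the
1988-2017 particulate-metals file, into the GHOST original_files archive tree.
"""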
import os
import time
from datetime import date, timedelta

import pandas
import requests



def scraper(mode, version):
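    """Download CAPMoN original files into the GHOST archive.

    mode    : 'all' downloads the full historical record; 'nrt' is not available for CAPMoN.
    version : GHOST version string used to build the original_files directory tree.
    """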

    base_url = "https://data-donnees.az.ec.gc.ca/api/file?path=/air/monitor/monitoring-of-atmospheric-gases/ground-level-ozone/"
    file = "AtmosphericGases-GroundLevelOzone-CAPMoN-AllSites-{}.csv"

    baseurl_ions = 'https://data-donnees.az.ec.gc.ca/api/file?path=/air/monitor/monitoring-of-atmospheric-precipitation-chemistry/major-ions/AtmosphericPrecipitationChemistry-MajorIons-CAPMoN-AllSites-{}.csv'

    components = ['O3', 'Particulate_Metals']
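    # major-ion precipitation chemistry is downloaded as well; it is stored under
    # precip/major-ions/ rather than under a component directory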

    if mode == 'all':
        bdate = date(1980, 1, 1)  # date before the record starts
        # extend one year beyond today so the current year is kept by the year-end frequency below
        edate = date.today() + timedelta(days=365)

        # create download directories
        for component in components:
            os.makedirs('/esarchive/obs/ghost/CAPMoN/original_files/{}/{}/'.format(version, component), exist_ok=True)
        os.makedirs('/esarchive/obs/ghost/CAPMoN/original_files/{}/precip/major-ions/'.format(version), exist_ok=True)

    elif mode == 'nrt':
        print("No NRT data for the CAPMoN network.")
        return

    else:
        print("Unknown time mode: {}".format(mode))
        return

    # build the list of years covered by the download period
    years = pandas.date_range(bdate, edate, freq='Y').strftime('%Y').tolist()

    print(years)

    # ozone
    download_location = '/esarchive/obs/ghost/CAPMoN/original_files/{}/{}/'.format(version, 'O3')
    download_location_wetdep = '/esarchive/obs/ghost/CAPMoN/original_files/{}/precip/major-ions/'.format(version)

    # common request headers for the ECCC open-data API
    headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}

    for year in years:

        url = base_url + file.format(year)

        r = requests.get(url, headers=headers, timeout=120)

        if r.status_code == 200:
            with open(download_location + file.format(year), "wb") as outfile:
                outfile.write(r.content)
            print('Downloaded {}'.format(file.format(year)))
        elif r.status_code == 403:
            print("Permission denied for {}".format(file.format(year)))
        else:
            print(file.format(year) + " download failed or no data")

        time.sleep(1)


        # major ions in wet deposition (precipitation chemistry)
        res = requests.get(baseurl_ions.format(year), headers=headers, timeout=120)
        time.sleep(1)

        if res.status_code == 200:
            # download_location_wetdep already ends in precip/major-ions/
            with open(download_location_wetdep + os.path.basename(baseurl_ions.format(year)), 'wb') as outfile:
                outfile.write(res.content)

            print('Downloaded {}'.format(os.path.basename(baseurl_ions.format(year))))

        elif res.status_code == 404:
            print("No major ions data in {}".format(year))

        else:
            print("Problem with major ions download for {} (status {})".format(year, res.status_code))



    # particulate metals
    base_url = "https://data-donnees.az.ec.gc.ca/api/file?path=/air/monitor/monitoring-of-atmospheric-particles/particulate-metals/"
    file = "AtmosphericParticles-ParticulateMetals-GLBM-MultipleSites-1988_2017.csv"
    download_location = '/esarchive/obs/ghost/CAPMoN/original_files/{}/Particulate_Metals/'.format(version)


    url = base_url + file

    r = requests.get(url, headers=headers, timeout=120)

    if r.status_code == 200:
        with open(download_location + file, "wb") as outfile:
            outfile.write(r.content)
        print('Downloaded ' + file)
    elif r.status_code == 403:
        print("Permission denied for {}".format(file))
    else:
        print(file + " download failed or no data")
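

# Example invocation (hypothetical; the version string depends on the GHOST archive setup):
# if __name__ == '__main__':
#     scraper('all', '1.6')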