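"""Download MITECO air-quality observations for the GHOST archive.

Scrapes the MITECO evaluation-data pages, downloads every linked zip
archive of official data and extracts it into the download location.
Supports a full-record mode ('all') and a near-real-time mode ('nrt').
"""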
import requests
import time
import pandas as pd
from datetime import date
from datetime import timedelta
import os
import shutil
import zipfile
import re
import glob

from selenium import webdriver
from bs4 import BeautifulSoup
from chromedriver_py import binary_path
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC



def scraper(mode, version):
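    """Download and extract all MITECO zip archives.

    mode    -- 'all' for the full record since 2001, 'nrt' for near real time
    version -- version tag used to build the 'all'-mode download path
    """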

    baseurl = 'https://www.miteco.gob.es/es/calidad-y-evaluacion-ambiental/temas/atmosfera-y-calidad-del-aire/calidad-del-aire/evaluacion-datos/datos/datos-2001-2021.html'


    if mode == 'all':
        bdate = date(2001, 1, 1) #date(1960, 1, 1) # date before record starts
        edate = date.today()
        download_location = '/esarchive/obs/ghost/MITECO/original_files/{}/'.format(version)

    elif mode == 'nrt':
        bdate = date(2024, 3, 2) #date.today() - timedelta(days = 1) # if code is run after 2 am, data from previous day will be available
        edate = date(2024, 3, 3) #date.today() - timedelta(days = 1)
        download_location = '/esarchive/obs/ghost/MITECO/original_files/nrt/'

    else:
        print('time mode inapplicable')
        return

    # make sure the download directory exists in either mode
    os.makedirs(download_location, exist_ok=True)


    # create year lists formatted as YYYY
    years_until_2015 = pd.date_range(bdate, date(2015, 1, 1), freq='Y').strftime('%Y').tolist() # currently unused: the base page's links are scraped directly below
    years_after_2015 = pd.date_range(date(2016, 1, 1), edate, freq='Y').strftime('%Y').tolist()
    print(years_after_2015)

    # set up driver
    options = Options()
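    # route Chrome's own downloads into download_location; the 2022 page is
    # fetched through the browser, so its zips land there as well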
    prefs = {'download.default_directory' : download_location}
    options.add_experimental_option('prefs', prefs)
    options.add_argument("--no-sandbox")
    #options.add_argument("--headless")

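    # binary_path is the chromedriver binary shipped with the chromedriver_py package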
    svc = webdriver.ChromeService(executable_path=binary_path)
    driver = webdriver.Chrome(service=svc, options=options)

    # open url
    driver.get(baseurl)

    # find zip links
    html = driver.page_source
    soup = BeautifulSoup(html, features="html.parser")
    zip_links = soup.find_all("a", href=re.compile(r"\.zip"))
    
    for zip_link in zip_links:
        filename = zip_link.get("href").rpartition('/')[-1]
        url = 'https://www.miteco.gob.es/{}'.format(zip_link.get("href"))

        r = requests.get(url, timeout=120)
        if r.status_code == 200:
            # write the payload we already fetched instead of downloading it a second time
            with open(download_location+filename, 'wb') as outfile:
                outfile.write(r.content)
            print('Downloaded {}'.format(filename))

            # unzip and discard the archive
            with zipfile.ZipFile(download_location+filename, 'r') as zip_ref:
                zip_ref.extractall(download_location)

            os.remove(download_location+filename)

        else:
            print('No {}'.format(url))

        time.sleep(1)
    

    # the per-year pages host the archives from 2016 onwards

    for year in years_after_2015:

        # the 2022 page breaks the usual URL pattern
        if year == '2022':
            driver.get('https://www.miteco.gob.es/es/calidad-y-evaluacion-ambiental/temas/atmosfera-y-calidad-del-aire/calidad-del-aire/evaluacion-datos/datos/datos-oficiales-2022.html')
        else:
            driver.get('https://www.miteco.gob.es/es/calidad-y-evaluacion-ambiental/temas/atmosfera-y-calidad-del-aire/calidad-del-aire/evaluacion-datos/datos/datos_oficiales_{}.html'.format(year))

        # allow the page to render before parsing it
        time.sleep(3)

        html = driver.page_source
        soup = BeautifulSoup(html, features="html.parser")
        zip_links = soup.find_all("a", href=re.compile(r"\.zip"))
        
        
        for zip_link in zip_links:
            filename = zip_link.get("href").rpartition('/')[-1]
            url = 'https://www.miteco.gob.es/{}'.format(zip_link.get("href"))

            if year == '2022':
                # the 2022 zips are fetched through the browser itself; they land
                # in download_location via the Chrome download preference set above
                driver.get(url)
                time.sleep(5)
                # unzip whatever Chrome has finished downloading so far
                for zip_file in glob.glob(download_location+'*.zip'):
                    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                        zip_ref.extractall(download_location)
                    os.remove(zip_file)

                continue

            r = requests.get(url, timeout=120)
            if r.status_code == 200:
                # write the payload we already fetched instead of downloading it a second time
                with open(download_location+filename, 'wb') as outfile:
                    outfile.write(r.content)
                print('Downloaded {}'.format(filename))

                # unzip and discard the archive
                with zipfile.ZipFile(download_location+filename, 'r') as zip_ref:
                    zip_ref.extractall(download_location)

                os.remove(download_location+filename)

            else:
                print('No {}'.format(url))

            time.sleep(1)

    # delete metadata
    for metadata in glob.glob(download_location+'*.xls'):
        os.remove(metadata)

    # flatten the directory tree: move extracted files up into download_location
    alldirectories = [directory for directory in os.listdir(download_location) if not os.path.isfile(os.path.join(download_location, directory))]
    for directory in alldirectories:
        allfiles = os.listdir(os.path.join(download_location, directory))
        for f in allfiles:
            os.rename(os.path.join(download_location, directory, f), os.path.join(download_location, f))
        try:
            shutil.rmtree(os.path.join(download_location, directory))
        except OSError:
            pass


    # quit() ends the whole browser session rather than just closing the window
    driver.quit()
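

# example invocation (assumption: callers pass the GHOST version tag, e.g. '1.6'):
# scraper('all', '1.6')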