import glob
import os.path
import re
import shutil
import time
import urllib.request
import zipfile
from datetime import date, timedelta  # timedelta is used by the commented-out nrt date logic

import pandas as pd
import requests
from bs4 import BeautifulSoup
from chromedriver_py import binary_path
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


def scraper(mode, version):

    baseurl = 'https://www.miteco.gob.es/es/calidad-y-evaluacion-ambiental/temas/atmosfera-y-calidad-del-aire/calidad-del-aire/evaluacion-datos/datos/datos-2001-2021.html'

    if mode == 'all':
        bdate = date(2001, 1, 1)  # date(1960, 1, 1) # date before record starts
        edate = date.today()
        download_location = '/esarchive/obs/ghost/MITECO/original_files/{}/'.format(version)
        os.makedirs(download_location, exist_ok=True)
    elif mode == 'nrt':
        # if the code is run after 2 am, data from the previous day is available
        bdate = date(2024, 3, 2)  # date.today() - timedelta(days=1)
        edate = date(2024, 3, 3)  # date.today() - timedelta(days=1)
        download_location = '/esarchive/obs/ghost/MITECO/original_files/nrt/'
    else:
        print('time mode inapplicable')
        return

    # create year lists; the base page covers 2001-2015, later years have their own pages
    years_until_2015 = pd.date_range(bdate, date(2015, 1, 1), freq='Y').strftime('%Y').tolist()
    years_after_2015 = pd.date_range(date(2016, 1, 1), edate, freq='Y').strftime('%Y').tolist()
    print(years_after_2015)

    # set up driver; Chrome downloads go straight to download_location
    options = Options()
    prefs = {'download.default_directory': download_location}
    options.add_experimental_option('prefs', prefs)
    options.add_argument("--no-sandbox")
    # options.add_argument("--headless")
    svc = webdriver.ChromeService(executable_path=binary_path)
    driver = webdriver.Chrome(service=svc, options=options)

    # open base url and collect the zip links on it
    driver.get(baseurl)
    html = driver.page_source
    soup = BeautifulSoup(html, features="html.parser")
    zip_links = soup.find_all("a", href=re.compile(r"\.zip"))

    for zip_link in zip_links:
        filename = zip_link.get("href").rpartition('/')[-1]
        url = 'https://www.miteco.gob.es/{}'.format(zip_link.get("href"))

        r = requests.get(url, timeout=120)
        if r.status_code == 200:
            urllib.request.urlretrieve(url, download_location + filename)
            print('Downloaded {}'.format(filename))

            # unzip and discard the archive
            with zipfile.ZipFile(download_location + filename, 'r') as zip_ref:
                zip_ref.extractall(download_location)
            os.remove(download_location + filename)
        else:
            print('No {}'.format(url))

        time.sleep(1)

    # go to the per-year pages for 2016 onwards
    for year in years_after_2015:
        driver.get('https://www.miteco.gob.es/es/calidad-y-evaluacion-ambiental/temas/atmosfera-y-calidad-del-aire/calidad-del-aire/evaluacion-datos/datos/datos_oficiales_{}.html'.format(year))
        if year == '2022':  # the 2022 page uses a different URL pattern
            driver.get('https://www.miteco.gob.es/es/calidad-y-evaluacion-ambiental/temas/atmosfera-y-calidad-del-aire/calidad-del-aire/evaluacion-datos/datos/datos-oficiales-2022.html')
        time.sleep(3)

        html = driver.page_source
        soup = BeautifulSoup(html, features="html.parser")
        zip_links = soup.find_all("a", href=re.compile(r"\.zip"))

        for zip_link in zip_links:
            filename = zip_link.get("href").rpartition('/')[-1]
            url = 'https://www.miteco.gob.es/{}'.format(zip_link.get("href"))

            if year == '2022':
                # 2022 files are fetched via the browser download, then unzipped in place
                driver.get(url)
                time.sleep(5)
                for zip_file in glob.glob(download_location + '*.zip'):
                    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                        zip_ref.extractall(download_location)
                    os.remove(zip_file)
                continue

            r = requests.get(url, timeout=120)
            if r.status_code == 200:
                urllib.request.urlretrieve(url, download_location + filename)
                print('Downloaded {}'.format(filename))

                # unzip and discard the archive
                with zipfile.ZipFile(download_location + filename, 'r') as zip_ref:
                    zip_ref.extractall(download_location)
                os.remove(download_location + filename)
            else:
                print('No {}'.format(url))

            time.sleep(1)

    # delete metadata
    for metadata in glob.glob(download_location + '*.xls'):
        os.remove(metadata)

    # flatten: move extracted files out of their subdirectories, then remove the directories
    alldirectories = [directory for directory in os.listdir(download_location)
                      if not os.path.isfile(os.path.join(download_location, directory))]
    for directory in alldirectories:
        allfiles = os.listdir(os.path.join(download_location, directory))
        for f in allfiles:
            os.rename(os.path.join(download_location, directory, f),
                      os.path.join(download_location, f))
        try:
            shutil.rmtree(os.path.join(download_location, directory))
        except OSError:
            pass

    driver.close()
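

# Minimal usage sketch (assumption: '1.0' is an illustrative version tag, not one
# defined by this script; it only names the subdirectory under original_files/
# that downloads are written to).
if __name__ == '__main__':
    scraper('all', '1.0')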