import requests
import time
import pandas as pd
from datetime import date
from datetime import timedelta
import os.path
import urllib.request
import tarfile
import shutil
import zipfile
import re
import glob
from selenium import webdriver
from bs4 import BeautifulSoup
from chromedriver_py import binary_path
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC


def download_data(mode, version, n_max_tries, max_time_per_dl):

    baseurl = 'https://www.miteco.gob.es/es/calidad-y-evaluacion-ambiental/temas/atmosfera-y-calidad-del-aire/calidad-del-aire/evaluacion-datos/datos/datos-2001-2021.html'

    if mode == 'all':
        bdate = date(2001, 1, 1)  # date(1960, 1, 1) # date before record starts
        edate = date.today()
        os.makedirs('/esarchive/obs/ghost/MITECO/original_files/{}/'.format(version), exist_ok=True)
        download_location = '/esarchive/obs/ghost/MITECO/original_files/{}/'.format(version)
    elif mode == 'nrt':
        print("nrt not available")
        download_location = '/esarchive/obs/ghost/MITECO/original_files/nrt/'
        return  # no start/end dates defined in nrt mode
    else:
        print('time mode inapplicable')
        return

    # create year arrays: data up to 2015 is linked on the overview page,
    # later years have their own pages
    years_until_2015 = pd.date_range(bdate, date(2015, 1, 1), freq='Y').strftime('%Y').tolist()
    years_after_2015 = pd.date_range(date(2016, 1, 1), edate, freq='Y').strftime('%Y').tolist()
    print(years_after_2015)

    # set up Chrome driver that downloads straight into the target directory
    options = Options()
    prefs = {'download.default_directory': download_location}
    options.add_experimental_option('prefs', prefs)
    options.add_argument("--no-sandbox")
    #options.add_argument("--headless")
    svc = webdriver.ChromeService(executable_path=binary_path)
    driver = webdriver.Chrome(service=svc, options=options)

    # open url
    driver.get(baseurl)

    # find zip links on the 2001-2021 overview page
    html = driver.page_source
    soup = BeautifulSoup(html, features="html.parser")
    zip_links = soup.find_all("a", href=re.compile(r"\.zip"))

    for zip_link in zip_links:
        filename = zip_link.get("href").rpartition('/')[-1]
        url = 'https://www.miteco.gob.es/{}'.format(zip_link.get("href"))

        n_tries = 0
        errcode = 999
        while (n_tries < n_max_tries) and (errcode != 200):
            r = requests.get(url, timeout=max_time_per_dl)
            if r.status_code == 200:
                urllib.request.urlretrieve(url, download_location+filename)
                print('Downloaded {}'.format(filename))

                # unzip and remove the archive
                with zipfile.ZipFile(download_location+filename, 'r') as zip_ref:
                    zip_ref.extractall(download_location)
                os.remove(download_location+filename)
                errcode = r.status_code
            elif r.status_code == 404:
                print("No data found, error 404")
                errcode = 200
            elif r.status_code == 403:
                print("Permission denied for {}".format(url))
                errcode = 200
            else:
                # try again with a doubled timeout
                print('Response error {}, attempt {}'.format(r.status_code, n_tries))
                errcode = r.status_code
                n_tries += 1
                max_time_per_dl = max_time_per_dl*2
                time.sleep(n_tries ** 2)  # wait a little longer every time

        if n_tries == n_max_tries:
            print('Failed downloading {} {} times in {} seconds, error code {}'.format(url, n_tries, max_time_per_dl, errcode))
        time.sleep(1)

    # go to the per-year pages (the 2022 page uses a different URL pattern)
    for year in years_after_2015:
        driver.get('https://www.miteco.gob.es/es/calidad-y-evaluacion-ambiental/temas/atmosfera-y-calidad-del-aire/calidad-del-aire/evaluacion-datos/datos/datos_oficiales_{}.html'.format(year))
        if year == '2022':
            driver.get('https://www.miteco.gob.es/es/calidad-y-evaluacion-ambiental/temas/atmosfera-y-calidad-del-aire/calidad-del-aire/evaluacion-datos/datos/datos-oficiales-2022.html')
        time.sleep(3)
        html = driver.page_source
        soup = BeautifulSoup(html, features="html.parser")
        zip_links = soup.find_all("a", href=re.compile(r"\.zip"))

        for zip_link in zip_links:
            filename = zip_link.get("href").rpartition('/')[-1]
            #print(filename)
            url = 'https://www.miteco.gob.es/{}'.format(zip_link.get("href"))

            if year == '2022':
                # 2022 files are downloaded through the browser itself
                driver.get(url)
                time.sleep(5)
                # unzip whatever the browser has saved so far
                for zip_file in glob.glob(download_location+'*.zip'):
                    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                        zip_ref.extractall(download_location)
                    os.remove(zip_file)
                continue

            n_tries = 0
            errcode = 999
            while (n_tries < n_max_tries) and (errcode != 200):
                r = requests.get(url, timeout=max_time_per_dl)
                if r.status_code == 200:
                    urllib.request.urlretrieve(url, download_location+filename)
                    print('Downloaded {}'.format(filename))

                    # unzip and remove the archive
                    with zipfile.ZipFile(download_location+filename, 'r') as zip_ref:
                        zip_ref.extractall(download_location)
                    os.remove(download_location+filename)
                    errcode = r.status_code
                elif r.status_code == 404:
                    print("No data found, error 404")
                    errcode = 200
                elif r.status_code == 403:
                    print("Permission denied for {}".format(url))
                    errcode = 200
                else:
                    # try again with a doubled timeout
                    print('Response error {}, attempt {}'.format(r.status_code, n_tries))
                    errcode = r.status_code
                    n_tries += 1
                    max_time_per_dl = max_time_per_dl*2
                    time.sleep(n_tries ** 2)  # wait a little longer every time

            if n_tries == n_max_tries:
                print('Failed downloading {} {} times in {} seconds, error code {}'.format(url, n_tries, max_time_per_dl, errcode))
            time.sleep(1)

    # delete metadata spreadsheets that come bundled with the data
    for metadata in glob.glob(download_location+'*.xls'):
        os.remove(metadata)

    # flatten the directory structure: move files out of the extracted
    # subdirectories, then remove the emptied directories
    alldirectories = [directory for directory in os.listdir(download_location) if not os.path.isfile(os.path.join(download_location, directory))]
    for directory in alldirectories:
        allfiles = os.listdir(os.path.join(download_location, directory))
        for f in allfiles:
            os.rename(os.path.join(download_location, directory, f), os.path.join(download_location, f))
        try:
            shutil.rmtree(os.path.join(download_location, directory))
        except OSError:
            pass

    driver.close()


def download_metadata(n_max_tries, max_time_per_dl):

    url_metadata = 'https://www.miteco.gob.es/content/dam/miteco/es/calidad-y-evaluacion-ambiental/sgalsi/atm%C3%B3sfera-y-calidad-del-aire/evaluaci%C3%B3n-2022/Metainformacion2022.xlsx'
    download_location = "/esarchive/obs/ghost/MITECO/metadata/network_provided/MITECO_META_{}.xlsx"
    Headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}

    n_tries = 0
    errcode = 999
    today = date.today()

    while (n_tries < n_max_tries) and (errcode != 200):
        # request inside the loop so every retry actually re-fetches the file
        r = requests.get(url_metadata, headers=Headers, timeout=max_time_per_dl)
        if r.status_code == 200:
            with open(download_location.format(today.strftime('%Y%m%d')), 'wb') as outfile:
                outfile.write(r.content)
            print('Downloaded metadata')
            errcode = r.status_code
        elif r.status_code == 404:
            print("No metadata found, error 404")
            errcode = 200
        else:
            # try again
            print('Response error {}, attempt {}'.format(r.status_code, n_tries))
            errcode = r.status_code
            n_tries += 1
            time.sleep(n_tries ** 2)  # wait a little longer every time

    if n_tries == n_max_tries:
        print('Failed downloading {} {} times in {} seconds, error code {}'.format(url_metadata, n_tries, max_time_per_dl, errcode))
    time.sleep(1)

    # convert the xlsx metadata to csv
    file = pd.read_excel(download_location.format(today.strftime('%Y%m%d')))
    file.to_csv('/esarchive/obs/ghost/MITECO/metadata/network_provided/MITECO_META_{}.csv'.format(today.strftime('%Y%m%d')), index=False, header=True)

    # The block below is kept disabled as in the original; it would also need
    # `import csv` and `import json` to run.
    """# create json from original metadata file
    json_metadata = {}
    with open('/esarchive/obs/ghost/MITECO/metadata/network_provided/MITECO_META.csv', 'r', encoding='ISO-8859-1') as file:
        csv_filedata = csv.DictReader(file)

        for row in csv_filedata:
            key = row['SiteName_NomDuSite']
            update_date = today.strftime('%Y-%m-%d')

            for parameter in row:
                row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]}  # create inner dictionary for every parameter

            json_metadata[key] = row

    with open('/esarchive/obs/ghost/MITECO/metadata/processed/MITECO_META.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(json_metadata, indent=4))

    # create json in desired shape from current metadata file
    json_metadata_now = {}
    with open(download_location.format(today.strftime('%Y%m%d')), encoding='ISO-8859-1') as file:
        csv_filedata = csv.DictReader(file)

        for row in csv_filedata:
            key = row['SiteName_NomDuSite']
            update_date = today.strftime('%Y-%m-%d')

            for parameter in row:
                row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]}  # create inner dictionary for every parameter

            json_metadata_now[key] = row

    # read standardised file to compare!
    with open('/esarchive/obs/ghost/MITECO/metadata/processed/MITECO_META.json', 'r', encoding='ISO-8859-1') as f:
        json_metadata = json.loads(f.read())

    for station in json_metadata:  # loop through all the old stations
        if station in json_metadata_now.keys():  # if station is in current metadata, go on
            for parameter in json_metadata[station]:
                if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]:
                    # compare last entry in standardised file to value in new file;
                    # if the value differs, append it to the standardised metadata file
                    print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
                    json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
                    json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
                else:
                    pass
        else:
            print('Station {} was abolished'.format(station))

    for station in json_metadata_now:  # loop through all the new stations
        if station in json_metadata.keys():  # if station is in old metadata
            pass  # comparison was done before
        else:  # new station appeared!
            print('New station {}'.format(station))
            json_metadata.update({station: json_metadata_now[station]})

    # save
    with open('/esarchive/obs/ghost/MITECO/metadata/processed/MITECO_META.json', 'w', encoding='ISO-8859-1') as f:
        f.write(json.dumps(json_metadata, indent=4))"""
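

# Minimal usage sketch (not part of the original script). The argument values
# below are assumptions chosen for illustration: 'all' download mode, a
# hypothetical version tag '1.6', three retries, and a 30-second per-download
# timeout. Adjust them to the actual GHOST processing setup before running.
if __name__ == '__main__':
    download_data(mode='all', version='1.6', n_max_tries=3, max_time_per_dl=30)
    download_metadata(n_max_tries=3, max_time_per_dl=30)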