Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import requests
import time
import pandas as pd
from datetime import date
from datetime import timedelta
import os.path
import urllib
import tarfile
import shutil
import zipfile
import re
import glob
from selenium import webdriver
from bs4 import BeautifulSoup
from chromedriver_py import binary_path
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
def _fetch_and_extract(url, filename, download_location):
    """Download one zip archive into *download_location*, extract it, delete the zip.

    Returns True on success, False when the server did not answer 200.
    """
    r = requests.get(url, timeout=120)
    if r.status_code != 200:
        print('No {}'.format(url))
        return False
    local_path = os.path.join(download_location, filename)
    # write the already-fetched payload; previously the file was downloaded a
    # second time via urllib.request.urlretrieve
    with open(local_path, 'wb') as f:
        f.write(r.content)
    print('Downloaded {}'.format(filename))
    with zipfile.ZipFile(local_path, 'r') as zip_ref:
        zip_ref.extractall(download_location)
    os.remove(local_path)
    return True


def scraper(mode, version):
    """Scrape official MITECO air-quality zip archives into the GHOST tree.

    Parameters
    ----------
    mode : str
        'all'  -> full historical download into .../original_files/{version}/
        'nrt'  -> near-real-time window into .../original_files/nrt/
        anything else prints a message and returns without doing work.
    version : str
        Archive version tag; only used to build the 'all'-mode directory.

    Side effects: drives a Chrome browser, downloads and extracts zip files,
    deletes *.xls metadata, and flattens extracted subdirectories.
    """
    baseurl = 'https://www.miteco.gob.es/es/calidad-y-evaluacion-ambiental/temas/atmosfera-y-calidad-del-aire/calidad-del-aire/evaluacion-datos/datos/datos-2001-2021.html'

    if mode == 'all':
        bdate = date(2001, 1, 1)  # date before record starts
        edate = date.today()
        download_location = '/esarchive/obs/ghost/MITECO/original_files/{}/'.format(version)
    elif mode == 'nrt':
        # if code is run after 2 am, data from previous day will be available
        bdate = date(2024, 3, 2)
        edate = date(2024, 3, 3)
        download_location = '/esarchive/obs/ghost/MITECO/original_files/nrt/'
    else:
        print('time mode inapplicable')
        # bug fix: previously execution fell through and crashed with a
        # NameError on the undefined bdate/download_location
        return

    # previously only 'all' mode created the directory; 'nrt' downloads
    # failed when the target directory did not exist yet
    os.makedirs(download_location, exist_ok=True)

    # NOTE(review): computed but never used — the pre-2016 archives come from
    # the zip links on the base page; kept so bdate's intent stays visible.
    years_until_2015 = pd.date_range(bdate, date(2015, 1, 1), freq='Y').strftime('%Y').tolist()
    # annual pages exist only from 2016 onwards
    years_after_2015 = pd.date_range(date(2016, 1, 1), edate, freq='Y').strftime('%Y').tolist()
    print(years_after_2015)

    # set up driver; Chrome drops browser-initiated downloads straight into
    # download_location via the prefs below
    options = Options()
    options.add_experimental_option('prefs', {'download.default_directory': download_location})
    options.add_argument("--no-sandbox")
    #options.add_argument("--headless")
    svc = webdriver.ChromeService(executable_path=binary_path)
    driver = webdriver.Chrome(service=svc, options=options)

    # escaped dot: r".zip" also matched e.g. "azip" since '.' is any character
    zip_href = re.compile(r"\.zip")

    try:
        # bundled pre-2016 archives linked directly from the base page
        driver.get(baseurl)
        soup = BeautifulSoup(driver.page_source, features="html.parser")
        for zip_link in soup.find_all("a", href=zip_href):
            href = zip_link.get("href")
            filename = href.rpartition('/')[-1]
            _fetch_and_extract('https://www.miteco.gob.es/{}'.format(href), filename, download_location)
            time.sleep(1)

        # per-year pages from 2016 onwards
        for year in years_after_2015:
            if year == '2022':
                # 2022 uses a differently-named landing page
                driver.get('https://www.miteco.gob.es/es/calidad-y-evaluacion-ambiental/temas/atmosfera-y-calidad-del-aire/calidad-del-aire/evaluacion-datos/datos/datos-oficiales-2022.html')
            else:
                driver.get('https://www.miteco.gob.es/es/calidad-y-evaluacion-ambiental/temas/atmosfera-y-calidad-del-aire/calidad-del-aire/evaluacion-datos/datos/datos_oficiales_{}.html'.format(year))
            time.sleep(3)
            soup = BeautifulSoup(driver.page_source, features="html.parser")
            for zip_link in soup.find_all("a", href=zip_href):
                href = zip_link.get("href")
                filename = href.rpartition('/')[-1]
                url = 'https://www.miteco.gob.es/{}'.format(href)
                if year == '2022':
                    # 2022 links only resolve inside the browser session, so
                    # let Chrome download them, then unzip whatever landed
                    driver.get(url)
                    time.sleep(5)
                    for zip_file in glob.glob(download_location + '*.zip'):
                        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                            zip_ref.extractall(download_location)
                        os.remove(zip_file)
                    continue
                _fetch_and_extract(url, filename, download_location)
                time.sleep(1)

        # delete metadata spreadsheets shipped alongside the data
        for metadata in glob.glob(download_location + '*.xls'):
            os.remove(metadata)

        # flatten: move files out of extracted subdirectories, then drop them
        for directory in os.listdir(download_location):
            dirpath = os.path.join(download_location, directory)
            if os.path.isfile(dirpath):
                continue
            for f in os.listdir(dirpath):
                os.rename(os.path.join(dirpath, f), os.path.join(download_location, f))
            try:
                shutil.rmtree(dirpath)
            except OSError:
                # best-effort cleanup; previously a bare except hid all errors
                pass
    finally:
        # quit() (not close()) ends the whole session even if scraping failed
        driver.quit()