# NOTE(review): removed non-code residue from the original web-viewer scrape
# ("Newer"/"Older" pager links and a column of display line numbers 1-55).
from selenium import webdriver
from chromedriver_py import binary_path
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup
import requests
import time
from datetime import date
from datetime import timedelta
import pandas as pd
import os.path
import urllib
import time
import ssl
import zipfile
import shutil
import os
import re
def scraper(mode, version):
    """Download CHILE SINCA air-quality data files via Selenium.

    Navigates https://sinca.mma.gob.cl, iterates regions/stations/components,
    selects each available time resolution, triggers the "Excel CSV" export and
    copies the downloaded file into the GHOST original_files tree.

    Parameters
    ----------
    mode : str
        "all"  -> download the full period offered by the site (reads the
                  pre-filled from/to form fields).
        "nrt"  -> near-real-time update: injects `bdate`..today into the form.
    version : str
        GHOST version string used to build download/storage directories.

    Returns
    -------
    None. Side effects only (filesystem writes, browser session).
    """
    # paths and variables
    variables_website = ['MP 10', 'MP 2,5', 'MP 10discreto', 'SO2', 'NO2', 'NOX', 'NO', 'CO', 'O3', 'Pb', 'As', 'Cu']  # complete list later
    variables_text = ["Material particulado MP 10", "Material particulado MP 2,5", "Material particulado 10 micrometros discreto", 'Dióxido de azufre', 'Dióxido de nitrógeno', 'Óxidos de nitrógeno', 'Monóxido de nitrógeno', 'Monóxido de carbono', 'Ozono', 'Plomo', 'Arsénico', 'Cobre']
    time_resolutions_website = ["registro diario", "registro horario"]  # complete later
    variables_ghost = ['PM10', 'PM2.5', 'PM10disc', 'SO2', 'NO2', 'NOX', 'NO', 'CO', 'O3', 'Pb', 'As', 'Cu']  # needs to be same order as variables_website!
    time_resolution_ghost = ['daily', 'hourly']
    metadata15 = pd.read_csv("/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/CHILE_SINCA_META.txt")
    baseurl = 'https://sinca.mma.gob.cl/index.php/'

    # only for nrt: date window as YYMMDD strings
    bdate = "240101"
    edate = date.today().strftime('%Y%m%d')[2:]
    print(edate)

    # create download directory
    os.makedirs('/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version), exist_ok=True)
    download_location = '/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version)

    # set up driver
    options = Options()
    prefs = {'download.default_directory': download_location}
    options.add_experimental_option('prefs', prefs)
    options.add_argument("--no-sandbox")
    options.add_argument("disable-infobars")
    options.add_argument("--disable-extensions")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-dev-shm-usage")
    svc = webdriver.ChromeService(executable_path=binary_path)
    driver = webdriver.Chrome(service=svc, options=options)

    # open url and wait until the region links are present
    driver.get(baseurl)
    WebDriverWait(driver, 60).until(EC.presence_of_all_elements_located((By.XPATH, '//a[contains(@href, "/index.php/region/index/id/")]')))

    # navigate to regions
    html = driver.page_source
    soup = BeautifulSoup(html, features="html.parser")
    regions = soup.find_all("a", href=re.compile(r"^/index.php/region/index/id/"))
    # NOTE(review): the next line deliberately overwrites the full region list,
    # restricting the run to Antofagasta (id II) — remove to scrape all regions.
    regions = soup.find_all("a", {"href": "/index.php/region/index/id/II"})  # Antofagasta

    for region in regions:
        print("Region is "+region.getText())
        driver.get("https://sinca.mma.gob.cl/"+region.get("href"))
        time.sleep(3)

        # navigate to station and component
        html_region = driver.page_source
        soup_region = BeautifulSoup(html_region, features="html.parser")
        a_titles = [a.get("title") for a in soup_region.find_all("a", title=True)]  # all links

        # from all the links, choose only the components
        stations_components = []
        for a_title in a_titles:
            for variable_text in variables_text:
                if variable_text in a_title:
                    stations_components.append(soup_region.find("a", {"title": a_title}))

        # loop through all stations and components of the region
        for station_component in stations_components:
            print(station_component.get("title"))
            station = station_component.get("title").split("| ", 1)[1]
            component = [x for x in variables_text if x in station_component.get("title")][0]  # get component name on website
            component_choose_time_res = variables_website[variables_text.index(component)]  # get component name for choosing time resolution
            component_ghost = variables_ghost[variables_text.index(component)]  # get component name accordingly in ghost

            # create storage directory; skip stations unknown to the 1.5 metadata
            try:
                station_id = metadata15["station_reference"][metadata15["station_name"] == station].iloc[0]
            except (IndexError, KeyError):
                # bug fix: was a bare `except:` that hid every failure, not just a missing station
                print("{} added since 1.5, no station_id found in metadata_1.5, did not download data from new station".format(station))
                continue
            os.makedirs('/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/{}/'.format(version, station_id), exist_ok=True)
            storage_location = '/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/{}/'.format(version, station_id)

            # go to data on website (hrefs are protocol-relative)
            driver.get('https:'+station_component.get("href"))
            time.sleep(5)
            driver.switch_to.frame("left")

            # select time resolution from the 'ic' dropdown
            dropdown_element = driver.find_element(By.ID, 'ic')
            select = Select(dropdown_element)
            # bug fix: this local was named `options`, shadowing the Chrome Options object above
            option_texts = [opt.get_attribute("text") for opt in select.options]

            i = 0
            for time_resolution in time_resolutions_website:
                # select time resolution only if existent for this component!
                if (component_choose_time_res+' - '+time_resolution) in option_texts:
                    select.select_by_visible_text(component_choose_time_res+' - '+time_resolution)
                    time.sleep(5)
                    if mode == "all":
                        # take the full period the site pre-fills into the form
                        start_date = driver.find_element(By.ID, "from").get_attribute("value")
                        end_date = driver.find_element(By.ID, "to").get_attribute("value")
                    if mode == "nrt":  # updating dates difficult
                        from_field = driver.find_element(By.ID, "from")
                        # bug fix: quote the injected value so JS receives a string, not a number
                        driver.execute_script("arguments[0].value = '{}';".format(bdate), from_field)
                        to_field = driver.find_element(By.ID, "to")
                        driver.execute_script("arguments[0].value = '{}';".format(edate), to_field)
                        # bug fix: the original left start_date/end_date bound to the
                        # WebElements, so the expected filename below contained element
                        # reprs and the wait loop never terminated. The site names the
                        # export after the form values — TODO confirm exact format.
                        start_date = bdate
                        end_date = edate
                    time.sleep(10)
                    driver.switch_to.default_content()
                    driver.switch_to.frame("right")
                    time.sleep(10)
                    WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.LINK_TEXT, "Excel CSV")))  # wait till loaded
                    driver.find_element(By.LINK_TEXT, "Excel CSV").click()

                    # wait until download finished, then move file into storage
                    downloaded_file = "{}/datos_{}_{}.csv".format(download_location, start_date, end_date)
                    while not os.path.exists(downloaded_file):
                        time.sleep(1)
                    if os.path.isfile(downloaded_file):
                        print('{} {} download successful'.format(station_component.get("title"), time_resolution_ghost[i]))
                        shutil.copyfile(downloaded_file, storage_location+component_ghost+'_'+time_resolution_ghost[i]+'.csv')
                        os.remove(downloaded_file)
                    driver.switch_to.default_content()
                    driver.switch_to.frame("left")
                i = i+1

    driver.close()
def scraper_metadata(mode, version):
    """Scrape station metadata from SINCA and compare it against GHOST 1.5.

    For each station page, reads region / province / commune / UTM coordinates /
    timezone from the info table and prints, column by column, whether the
    scraped values match the stored metadata file.

    Parameters
    ----------
    mode : str
        Unused here; kept for interface symmetry with `scraper`.
    version : str
        Unused here; kept for interface symmetry with `scraper`.

    Returns
    -------
    None. Output is printed; side effects are the browser session only.
    """
    baseurl = 'https://sinca.mma.gob.cl/index.php/'
    metadata15 = pd.read_csv("/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/temp/CHILE_SINCA_META_temp.txt")
    print(metadata15)

    # set up driver (no download dir needed — metadata only)
    options = Options()
    options.add_argument("--no-sandbox")
    svc = webdriver.ChromeService(executable_path=binary_path)
    driver = webdriver.Chrome(service=svc, options=options)

    # open url and wait until the region links are present
    driver.get(baseurl)
    WebDriverWait(driver, 60).until(EC.presence_of_all_elements_located((By.XPATH, '//a[contains(@href, "/index.php/region/index/id/")]')))

    # navigate to regions
    html = driver.page_source
    soup = BeautifulSoup(html, features="html.parser")
    regions = soup.find_all("a", href=re.compile(r"^/index.php/region/index/id/"))
    # NOTE(review): the next line deliberately overwrites the full region list,
    # restricting the run to Antofagasta (id II) — remove to scrape all regions.
    regions = soup.find_all("a", {"href": "/index.php/region/index/id/II"})  # Antofagasta

    for region in regions:
        print("Region is "+region.getText())
        driver.get("https://sinca.mma.gob.cl/"+region.get("href"))
        time.sleep(3)
        html = driver.page_source
        soup = BeautifulSoup(html, features="html.parser")
        stations = soup.find_all("a", href=re.compile(r"^/index.php/estacion/index/id/"))

        for station in stations:
            station_name = station.getText()
            print(station_name)
            # bug fix: `station_reference` was used below but never assigned (NameError).
            # Derive it from the station URL id — TODO confirm this matches the
            # reference scheme used in the GHOST metadata file.
            station_reference = station.get("href").rstrip("/").rsplit("/", 1)[-1]
            driver.get("https://sinca.mma.gob.cl/"+station.get("href"))
            time.sleep(3)

            # get meta info from the station's attribute table
            html = driver.page_source
            soup = BeautifulSoup(html, features="html.parser")
            # renamed from `region` to avoid shadowing the outer loop variable
            region_name = soup.find("th", text="Región").find_next_sibling().getText()
            province = soup.find("th", text="Provincia").find_next_sibling().getText()
            commune = soup.find("th", text="Comuna").find_next_sibling().getText()
            UTM_coordinates = soup.find("th", text="Coordenadas UTM").find_next_sibling().getText()
            timezone = soup.find("th", text="Huso horario").find_next_sibling().getText()
            # order must match the metadata file's column order for the compare below
            scraped_metadata = [station_reference, station_name, region_name, province, commune, UTM_coordinates, timezone]

            metadata15_per_station = metadata15.loc[metadata15["station_name"] == station_name]
            # robustness fix: stations missing from the metadata file used to crash on .iloc[0]
            if metadata15_per_station.empty:
                print("{} not found in stored metadata, skipping comparison".format(station_name))
                continue
            print(region_name)
            print(metadata15_per_station)
            print(metadata15_per_station["region"].iloc[0])

            # compare each stored column against the scraped value, in order
            i = 0
            for column in metadata15_per_station.columns:
                print(column)
                if metadata15_per_station[column].iloc[0] == scraped_metadata[i]:
                    print("ok!")
                else:
                    print("not ok")
                i = i+1

    driver.close()