# NOTE: removed "Newer" / "Older" web-page pagination artifacts that were
# accidentally captured into this copy of the source file.
import csv
import json
import os
import os.path
import re
import shutil
import ssl
import time
import urllib
import zipfile
from datetime import date
from datetime import timedelta

import pandas as pd
import requests
from bs4 import BeautifulSoup
from chromedriver_py import binary_path
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
def download_data(mode, version, n_max_tries, max_time_per_dl):
    """Download CHILE SINCA air-quality observations as per-station CSV files.

    Drives https://sinca.mma.gob.cl with Selenium: walks region -> station ->
    component pages, selects every available time resolution for each
    component, clicks the "Excel CSV" export, waits for the file to land in a
    temp download directory, and moves it to the per-station storage
    directory under /esarchive/obs/ghost/CHILE_SINCA/original_files/.

    Parameters
    ----------
    mode : str
        'all' for the full historical record, 'nrt' for the current year
        (near real time). Any other value aborts without downloading.
    version : str
        GHOST version tag used in the archive paths; overwritten with 'nrt'
        when mode == 'nrt'.
    n_max_tries : int
        Maximum number of retries after WebDriver failures.
    max_time_per_dl : float
        Timeout (seconds) for page loads and for a single CSV download.
    """
    # component names as they appear in the time-resolution dropdown
    variables_website = ['MP 10','MP 2,5', 'MP 10discreto', 'SO2', 'NO2', 'NOX', 'NO', 'CO', 'O3', 'Pb', 'As', 'Cu']
    # component names as they appear in the link titles on the region pages
    variables_text = ["Material particulado MP 10", "Material particulado MP 2,5", "Material particulado 10 micrometros discreto", 'Dióxido de azufre', 'Dióxido de nitrógeno', 'Óxidos de nitrógeno', 'Monóxido de nitrógeno', 'Monóxido de carbono', 'Ozono', 'Plomo', 'Arsénico', 'Cobre']
    time_resolutions_website = ["registro diario", "registro horario"]
    variables_ghost = ['PM10', 'PM2.5', 'PM10disc', 'SO2', 'NO2', 'NOX', 'NO','CO', 'O3','Pb', 'As', 'Cu'] # needs to be same order as variables_website!
    time_resolution_ghost = ['daily', 'hourly']
    # GHOST v1.5 station metadata: maps station names to station references
    metadata15 = pd.read_csv("/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/CHILE_SINCA_META.txt")
    baseurl = 'https://sinca.mma.gob.cl/index.php/'

    if mode == 'all':
        # create download directory
        os.makedirs('/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version), exist_ok=True)
        download_location = '/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version)
    elif mode == 'nrt':
        # date range: 1 January of the current year until today, as YYMMDD
        bdate = date(date.today().year, 1, 1).strftime('%Y%m%d')[2:] #"240101"
        edate = date.today().strftime('%Y%m%d')[2:]
        print(edate)
        # create download directory
        version = mode
        os.makedirs('/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version), exist_ok=True)
        download_location = '/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version)
    else:
        print('time mode inapplicable')
        # bug fix: bail out here; the original fell through and later crashed
        # with a NameError on the undefined download_location
        return

    n_tries = 0
    errcode = 999
    while (n_tries < n_max_tries) and (errcode != 200):
        try:
            # set up driver (renamed from `options` so the Chrome options are
            # not shadowed by the dropdown option list further down)
            chrome_options = Options()
            prefs = {'download.default_directory' : download_location}
            chrome_options.add_experimental_option('prefs', prefs)
            chrome_options.add_argument("--no-sandbox")
            chrome_options.add_argument("disable-infobars")
            chrome_options.add_argument("--disable-extensions")
            chrome_options.add_argument("--disable-gpu")
            chrome_options.add_argument("--disable-dev-shm-usage")
            if n_tries > 0:
                # first attempt runs with a visible browser; retries go headless
                chrome_options.add_argument("--headless")
            svc = webdriver.ChromeService(executable_path=binary_path)
            driver = webdriver.Chrome(service=svc, options=chrome_options)

            driver.get(baseurl)
            WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.XPATH, '//a[contains(@href, "/index.php/region/index/id/")]'))) # wait till loaded

            # navigate to regions
            html = driver.page_source
            soup = BeautifulSoup(html, features="html.parser")
            # NOTE: currently restricted to a single region; to process all
            # regions use:
            #   soup.find_all("a", href=re.compile(r"^/index.php/region/index/id/"))
            regions = soup.find_all("a", {"href": "/index.php/region/index/id/II"}) # Antofagasta

            for region in regions:
                print("Region is "+region.getText())
                driver.get("https://sinca.mma.gob.cl/"+region.get("href"))
                time.sleep(3)

                # navigate to station and component
                html_region = driver.page_source
                soup_region = BeautifulSoup(html_region, features="html.parser")
                a_titles = [a.get("title") for a in soup_region.find_all("a", title=True)] # all links

                # from all the links, choose only the components
                stations_components = []
                for a_title in a_titles:
                    for variable_text in variables_text:
                        if variable_text in a_title:
                            stations_components.append(soup_region.find("a", {"title": a_title}))

                # loop through all stations and components of the region
                for station_component in stations_components:
                    print(station_component.get("title"))
                    station = station_component.get("title").split("| ", 1)[1]
                    component = [x for x in variables_text if x in station_component.get("title")][0] # get component name on website
                    component_choose_time_res = variables_website[variables_text.index(component)] # get component name for choosing time resolution
                    component_ghost = variables_ghost[variables_text.index(component)] # get component name accordingly in ghost

                    # create storage directory; stations missing from the v1.5
                    # metadata have no station reference and are skipped
                    try:
                        station_id = metadata15["station_reference"][metadata15["station_name"] == station].iloc[0]
                        os.makedirs('/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/{}/'.format(version, station_id), exist_ok=True)
                        storage_location = '/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/{}/'.format(version, station_id)
                    except IndexError: # was a bare except; .iloc[0] on no match raises IndexError
                        print("{} added since 1.5, no station_id found in metadata_1.5, did not download data from new station".format(station))
                        continue

                    # go to data on website
                    driver.get('https:'+station_component.get("href"))

                    # select time resolution
                    dropdown_element = driver.find_element(By.ID, 'ic')
                    select = Select(dropdown_element)
                    dropdown_texts = [opt.get_attribute("text") for opt in select.options]

                    for i, time_resolution in enumerate(time_resolutions_website):
                        # select time resolution only if existent!
                        if (component_choose_time_res+' - '+time_resolution) not in dropdown_texts:
                            continue
                        select.select_by_visible_text(component_choose_time_res+' - '+time_resolution)
                        time.sleep(5)

                        if mode == "all":
                            # read the preset full date range from the form
                            start_date = driver.find_element(By.ID, "from").get_attribute("value")
                            end_date = driver.find_element(By.ID, "to").get_attribute("value")
                        if mode == "nrt": # updating dates difficult
                            from_field = driver.find_element(By.ID, "from")
                            driver.execute_script("arguments[0].value = {};".format(bdate), from_field)
                            to_field = driver.find_element(By.ID, "to")
                            driver.execute_script("arguments[0].value = {};".format(edate), to_field)
                            # bug fix: the downloaded filename is built from the
                            # date strings; the original left start_date/end_date
                            # bound to WebElements, so the existence check below
                            # could never match
                            start_date, end_date = bdate, edate

                        driver.switch_to.default_content()
                        driver.switch_to.frame("right")
                        time.sleep(10)
                        WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.LINK_TEXT, "Excel CSV"))) # wait till loaded
                        driver.find_element(By.LINK_TEXT, "Excel CSV").click()

                        # wait until download finished (bounded; the original
                        # polled forever if the download never appeared)
                        csv_file = "{}/datos_{}_{}.csv".format(download_location, start_date, end_date)
                        waited = 0
                        while not os.path.exists(csv_file) and waited < max_time_per_dl:
                            time.sleep(1)
                            waited += 1

                        if os.path.isfile(csv_file):
                            print('{} {} download successful'.format(station_component.get("title"), time_resolution_ghost[i]))
                            shutil.copyfile(csv_file, storage_location+component_ghost+'_'+time_resolution_ghost[i]+'.csv')
                            os.remove(csv_file)

                        # return to the station navigation frame
                        driver.switch_to.default_content()
                        driver.switch_to.frame("left")

            driver.close()
            errcode = 200
        except WebDriverException as e:
            print(e)
            n_tries = n_tries+1
            print("Number of tries: {}".format(n_tries))
            continue

    if n_tries == n_max_tries:
        print('Failed downloading CHILE_SINCA data {} times in {} seconds'.format(n_tries, max_time_per_dl))
def download_metadata(n_max_tries, max_time_per_dl):
# Scrape per-station metadata (region, province, commune, UTM coordinates,
# timezone, instruments per component) from the SINCA website with Selenium
# and merge it into the processed GHOST metadata JSON.
# n_max_tries: maximum number of retries after WebDriver failures.
# max_time_per_dl: page-load timeout in seconds.
# NOTE(review): in this copy of the file the indentation has been flattened and
# several original line ranges are missing (only bare line numbers remain);
# restore the function body from version control before running.
# paths and variables
variables_website = ['MP 10','MP 2,5', 'MP 10discreto', 'SO2', 'NO2', 'NOX', 'NO', 'CO', 'O3', 'Pb', 'As', 'Cu']
variables_text = ["Material particulado MP 10", "Material particulado MP 2,5", "Material particulado 10 micrometros discreto", 'Dióxido de azufre', 'Dióxido de nitrógeno', 'Óxidos de nitrógeno', 'Monóxido de nitrógeno', 'Monóxido de carbono', 'Ozono', 'Plomo', 'Arsénico', 'Cobre']
time_resolutions_website = ["registro diario", "registro horario"]
variables_ghost = ['PM10', 'PM2.5', 'PM10disc', 'SO2', 'NO2', 'NOX', 'NO','CO', 'O3','Pb', 'As', 'Cu'] # needs to be same order as variables_website!
time_resolution_ghost = ['daily', 'hourly']
baseurl = 'https://sinca.mma.gob.cl/index.php/'
# today's date is stamped into every updated metadata entry below
today = date.today()
# NOTE: original file lines 211-236 were lost in this extraction (only the bare
# line numbers survived); recover this span from version control.
# temporary v1.5 network-provided metadata, read for comparison
metadata15 = pd.read_csv("/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/temp/CHILE_SINCA_META_temp.txt")
#print(metadata15)
# instrument column names used by the old processed metadata layout
instruments_old = ['O3_instrument','NO_instrument','NO2_instrument','CO_instrument','CH4_instrument','SO2_instrument','NMHC_instrument','HC_instrument','PM10_instrument','PM2.5_instrument','As_instrument','Cu_instrument','Pb_instrument']
# previously processed metadata, keyed by station reference
# NOTE(review): `json` is not imported in the visible import block — confirm it
# is imported at the top of the file
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/processed/CHILE_SINCA_META.json', 'r', encoding='utf-8') as f:
metadata_old = json.loads(f.read())
n_tries = 0
errcode = 999
metadata_new = {}
# retry the whole scrape until it succeeds or n_max_tries is exhausted
while (n_tries < n_max_tries) and (errcode != 200):
try:
# set up driver
options = Options()
#prefs = {'download.default_directory' : download_location}
#options.add_experimental_option('prefs', prefs)
options.add_argument("--no-sandbox")
options.add_argument("--headless")
svc = webdriver.ChromeService(executable_path=binary_path)
driver = webdriver.Chrome(service=svc, options=options)
# open url
driver.get(baseurl)
WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.XPATH, '//a[contains(@href, "/index.php/region/index/id/")]'))) # wait till loaded
# navigate to regions
html = driver.page_source
soup = BeautifulSoup(html, features="html.parser")
# second assignment deliberately narrows the scrape to one region
regions = soup.find_all("a", href=re.compile(r"^/index.php/region/index/id/"))
regions = soup.find_all("a", {"href": "/index.php/region/index/id/II"}) # Antofagasta
# NOTE(review): `region` is unbound here — a `for region in regions:` loop
# header appears to have been lost in extraction
driver.get("https://sinca.mma.gob.cl/"+region.get("href"))
time.sleep(3)
html = driver.page_source
soup = BeautifulSoup(html, features="html.parser")
stations = soup.find_all("a", href=re.compile(r"^/index.php/estacion/index/id/"))
# NOTE(review): `station` is unbound here — a `for station in stations:` loop
# header appears to have been lost in extraction
station_name_new = station.getText()
print(station_name_new)
# NOTE: original file lines 259-393 were lost in this extraction (only the bare
# line numbers survived), including the loop structure that binds `region` and
# `station`; recover this span from version control.
# open the station page and scrape its metadata table
driver.get("https://sinca.mma.gob.cl/"+station.get("href"))
time.sleep(3)
# get meta info
html = driver.page_source
soup = BeautifulSoup(html, features="html.parser")
# NOTE(review): BeautifulSoup's `text=` keyword in find() is deprecated in
# newer bs4 releases in favour of `string=` — confirm the installed version
region = soup.find("th", text="Región").find_next_sibling().getText()
province = soup.find("th", text="Provincia").find_next_sibling().getText()
commune = soup.find("th", text="Comuna").find_next_sibling().getText()
UTM_coordinates = soup.find("th", text="Coordenadas UTM").find_next_sibling().getText().replace(' ', '')
# split "....E....N" style coordinates into easting/northing parts
lon = UTM_coordinates.split('E')[0]+'E'
lat = UTM_coordinates.split('E')[1]
timezone = soup.find("th", text="Huso horario").find_next_sibling().getText()
# measurement table: one "helpTecnica" cell per component instrument
ins_table = soup.find('table', id="medicion")
instruments = ins_table.find_all("td", {"class": "helpTecnica center"})
instruments_per_component = {}
for instrument in instruments:
component = instrument.find_parent().find('a').getText()
if len(component) > 5: # filter for short names that were already given and don't need to be renamed
component = variables_ghost[variables_text.index(component)]
# NOTE(review): this elif looks unreachable — 'MP 1,5' is 6 characters, so any
# name containing it has len > 5 and takes the branch above instead
elif 'MP 1,5' in component:
component = 'MP2.5'
#======
# "No informado" on the website means no instrument is reported
if "No informado" in instrument.getText():
instruments_per_component[component] = None
else:
# collapse repeated spaces, keep the last line of the cell text
instrument_name = re.sub(' +', ' ', instrument.getText())
instrument_name = instrument_name.split("\n")[-1]
instruments_per_component[component] = instrument_name
print(instruments_per_component)
# match the scraped station against the old metadata by station name
for station_reference in metadata_old:
if metadata_old[station_reference]['station_name']['values'][0] == station_name_new: # match station with previously referenced station reference from old file
i=0
scraped_metadata = [station_reference, station_name_new, region, province, commune, lon, lat, timezone]
for parameter in metadata_old[station_reference]:
#print(parameter)
# NOTE(review): `"instrument" and "comments" not in parameter` evaluates as
# `bool("instrument") and ("comments" not in parameter)`, i.e. it only checks
# "comments" — probably meant
# `"instrument" not in parameter and "comments" not in parameter`
if "instrument" and "comments" not in parameter:
# NOTE(review): metadata_new starts empty, so indexing
# metadata_new[station_reference] raises KeyError — likely needs setdefault
metadata_new[station_reference].update({parameter: {"values": [scraped_metadata[i]], "update_time": [today.strftime('%Y-%m-%d')]}})
# NOTE(review): `i=+1` assigns +1 every pass; almost certainly meant `i += 1`
i=+1
"""
elif "comments" == parameter:
metadata_new.update({station_reference: {parameter: {"values": [None], "update_time": [today.strftime('%Y-%m-%d')]}}})
#metadata_new[station_reference] = {parameter: {"values": [None], "update_time": [today.strftime('%Y-%m-%d')]}}
else:
for component in instruments_per_component:
if component in parameter:
#metadata_new[station_reference] = {parameter: {"values": [instruments_per_component[component]], "update_time": [today.strftime('%Y-%m-%d')]}}
metadata_new.update({station_reference: {parameter: {"values": [instruments_per_component[component]], "update_time": [today.strftime('%Y-%m-%d')]}}})
else:
#metadata_new[station_reference] = {parameter: {"values": [None], "update_time": [today.strftime('%Y-%m-%d')]}}
metadata_new.update({station_reference: {parameter: {"values": [None], "update_time": [today.strftime('%Y-%m-%d')]}}})"""
print(metadata_new)
"""
metadata15_per_station = metadata15.loc[metadata15["station_name"] == station_name]
print(region)
print(metadata15_per_station)
print(metadata15_per_station["region"].iloc[0])
i=0
for column in metadata15_per_station.head():
print(column)
if metadata15_per_station[column].iloc[0] == scraped_metadata[i]:
print("ok!")
else:
print("not ok")
i=i+1"""
driver.close()
errcode = 200
except WebDriverException as e:
print(e)
n_tries = n_tries+1
print("Number of tries: {}".format(n_tries))
continue
if n_tries == n_max_tries:
print('Failed downloading CHILE_SINCA metadata {} times in {} seconds'.format(n_tries, max_time_per_dl))
print(metadata_new)
"""
# create json from original metadata file =====================================================================================
json_metadata = {}
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/CHILE_SINCA_META.txt', 'r') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['station_reference']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/processed/CHILE_SINCA_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))"""
"""
# read standardised file to compare!
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/processed/CHILE_SINCA_META.json', 'r', encoding='utf-8') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadeta file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# safe
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/processed/CHILE_SINCA_META.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(json_metadata, indent=4))"""