# NOTE(review): the original extraction of this file began with the literal
# lines "Newer" / "Older" — navigation buttons captured from a web diff
# viewer, not Python code. Replaced with this comment.
from selenium import webdriver
from chromedriver_py import binary_path
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup
import requests
import time
from datetime import date
from datetime import timedelta
import pandas as pd
import os.path
import urllib
import time
import ssl
import zipfile
import shutil
import os
import re
def download_data(mode, version, n_max_tries, max_time_per_dl):
    """Scrape measurement CSVs from the CHILE SINCA website into the GHOST archive.

    Drives a Chrome browser through https://sinca.mma.gob.cl, walking
    region pages -> station/component links, selecting each available time
    resolution in the station dropdown, clicking the "Excel CSV" export and
    copying the downloaded file into the per-station storage directory.

    Parameters:
        mode: 'all' for a full historical download, 'nrt' to restrict the
            date range to the current year (any other value only prints a
            message — presumably the function then fails later; TODO confirm).
        version: GHOST version tag used in the download/storage paths;
            overwritten with 'nrt' when mode == 'nrt'.
        n_max_tries: maximum number of retry attempts after WebDriver errors.
        max_time_per_dl: timeout in seconds handed to WebDriverWait.

    NOTE(review): this source was recovered from a web-diff view — the
    original indentation was lost and some line ranges are missing (they
    appear below as bare integer statements, e.g. `45`). The indentation in
    this block is a best-effort reconstruction; confirm against the
    repository of record before running.
    """
    # Component labels as they appear in the station's time-resolution dropdown.
    variables_website = ['MP 10','MP 2,5', 'MP 10discreto', 'SO2', 'NO2', 'NOX', 'NO', 'CO', 'O3', 'Pb', 'As', 'Cu']
    # Component names as they appear in link titles on the region pages (Spanish).
    variables_text = ["Material particulado MP 10", "Material particulado MP 2,5", "Material particulado 10 micrometros discreto", 'Dióxido de azufre', 'Dióxido de nitrógeno', 'Óxidos de nitrógeno', 'Monóxido de nitrógeno', 'Monóxido de carbono', 'Ozono', 'Plomo', 'Arsénico', 'Cobre']
    # Time-resolution suffixes shown in the dropdown ("daily record", "hourly record").
    time_resolutions_website = ["registro diario", "registro horario"]
    variables_ghost = ['PM10', 'PM2.5', 'PM10disc', 'SO2', 'NO2', 'NOX', 'NO','CO', 'O3','Pb', 'As', 'Cu'] # needs to be same order as variables_website!
    time_resolution_ghost = ['daily', 'hourly']
    # GHOST v1.5 station metadata; used to map website station names to station IDs.
    metadata15 = pd.read_csv("/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/CHILE_SINCA_META.txt")
    baseurl = 'https://sinca.mma.gob.cl/index.php/'
    # NOTE(review): the bare integers below are line-number artifacts from the
    # diff viewer this file was extracted from; original source lines 45-89
    # were collapsed there and are MISSING here.
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    if mode == 'all':
        # create download directory
        os.makedirs('/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version), exist_ok=True)
        download_location = '/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version)
    elif mode == 'nrt':
        # NRT window: Jan 1st of the current year to today, as YYMMDD strings.
        bdate = date(date.today().year, 1, 1).strftime('%Y%m%d')[2:] #"240101"
        edate = date.today().strftime('%Y%m%d')[2:]
        print(edate)
        # create download directory; version tag is replaced by the mode name
        version = mode
        os.makedirs('/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version), exist_ok=True)
        download_location = '/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/temp/'.format(version)
    else:
        # NOTE(review): download_location stays undefined on this path; the
        # driver setup below would raise NameError — presumably intentional
        # fail-fast, but confirm.
        print('time mode inapplicable')
    # Retry loop: errcode is set to 200 only when a full pass succeeds.
    n_tries = 0
    errcode = 999
    while (n_tries < n_max_tries) and (errcode != 200):
        try:
            # set up driver: downloads go straight to download_location
            options = Options()
            prefs = {'download.default_directory' : download_location}
            options.add_experimental_option('prefs', prefs)
            options.add_argument("--no-sandbox")
            options.add_argument("disable-infobars")
            options.add_argument("--disable-extensions")
            options.add_argument("--disable-gpu")
            options.add_argument("--disable-dev-shm-usage")
            # first attempt runs headed; retries run headless
            if n_tries > 0:
                options.add_argument("--headless")
            svc = webdriver.ChromeService(executable_path=binary_path)
            driver = webdriver.Chrome(service=svc, options=options)
            driver.get(baseurl)
            WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.XPATH, '//a[contains(@href, "/index.php/region/index/id/")]'))) # wait till loaded
            # navigate to regions
            html = driver.page_source
            soup = BeautifulSoup(html, features="html.parser")
            regions = soup.find_all("a", href=re.compile(r"^/index.php/region/index/id/"))
            # NOTE(review): the next line overwrites the full region list and
            # restricts the crawl to a single region (Antofagasta) — looks
            # like leftover debugging; confirm before a production run.
            regions = soup.find_all("a", {"href": "/index.php/region/index/id/II"}) # Antofagasta
            # NOTE(review): bare integers below are diff-viewer artifacts;
            # original source lines 92-129 are MISSING here.
            92
            93
            94
            95
            96
            97
            98
            99
            100
            101
            102
            103
            104
            105
            106
            107
            108
            109
            110
            111
            112
            113
            114
            115
            116
            117
            118
            119
            120
            121
            122
            123
            124
            125
            126
            127
            128
            129
            for region in regions:
                print("Region is "+region.getText())
                driver.get("https://sinca.mma.gob.cl/"+region.get("href"))
                time.sleep(3)
                # navigate to station and component
                html_region = driver.page_source
                soup_region = BeautifulSoup(html_region, features="html.parser")
                a_titles = [a.get("title") for a in soup_region.find_all("a", title=True)] # all links
                # from all the links, choose only the components
                stations_components = []
                for a_title in a_titles:
                    for variable_text in variables_text:
                        if variable_text in a_title:
                            stations_components.append(soup_region.find("a", {"title": a_title}))
                # loop through all stations and components of the region
                for station_component in stations_components:
                    print(station_component.get("title"))
                    # link title format is "<component> | <station name>"
                    station = station_component.get("title").split("| ", 1)[1]
                    component = [x for x in variables_text if x in station_component.get("title")][0] # get component name on website
                    component_choose_time_res = variables_website[variables_text.index(component)] # get component name for choosing time resolution
                    component_ghost = variables_ghost[variables_text.index(component)] # get component name accordingly in ghost
                    # create storage directory; stations unknown to the v1.5
                    # metadata have no station_id and are skipped
                    try:
                        station_id = metadata15["station_reference"][metadata15["station_name"] == station].iloc[0]
                        os.makedirs('/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/{}/'.format(version, station_id), exist_ok=True)
                        storage_location = '/esarchive/obs/ghost/CHILE_SINCA/original_files/{}/{}/'.format(version, station_id)
                    except:
                        print("{} added since 1.5, no station_id found in metadata_1.5, did not download data from new station".format(station))
                        continue
                    # go to data on website (href is protocol-relative)
                    driver.get('https:'+station_component.get("href"))
                    # select time resolution from the 'ic' dropdown
                    dropdown_element = driver.find_element(By.ID, 'ic')
                    select = Select(dropdown_element)
                    # NOTE(review): rebinds `options` (was the Chrome Options
                    # object) to the dropdown's option texts.
                    options = [opt.get_attribute("text") for opt in select.options]
                    # i indexes time_resolution_ghost in step with this loop
                    i=0
                    for time_resolution in time_resolutions_website:
                        #select time resolution if existent!
                        if (component_choose_time_res+' - '+time_resolution) in options:
                            select.select_by_visible_text(component_choose_time_res+' - '+time_resolution)
                            #print("Time resolution is: {}".format(time_resolution_ghost[i]))
                            time.sleep(5)
                            if mode == "all":
                                # keep the site's own full date range; the values
                                # also name the downloaded file (datos_<from>_<to>.csv)
                                start_date = driver.find_element(By.ID, "from").get_attribute("value")
                                end_date = driver.find_element(By.ID, "to").get_attribute("value")
                            if mode == "nrt": # updating dates difficult
                                # inject the NRT window directly via JS
                                start_date = driver.find_element(By.ID, "from")
                                driver.execute_script("arguments[0].value = {};".format(bdate), start_date)
                                end_date = driver.find_element(By.ID, "to")
                                driver.execute_script("arguments[0].value = {};".format(edate), end_date)
                                # NOTE(review): in nrt mode start_date/end_date stay
                                # WebElements, so the filename checks below format
                                # element reprs, not dates — TODO confirm intended.
                            driver.switch_to.default_content()
                            driver.switch_to.frame("right")
                            time.sleep(10)
                            WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.LINK_TEXT, "Excel CSV"))) # wait till loaded
                            driver.find_element(By.LINK_TEXT, "Excel CSV").click()
                            # wait until download finished (no timeout here — can spin forever)
                            while not os.path.exists("{}/datos_{}_{}.csv".format(download_location, start_date, end_date)):
                                time.sleep(1)
                            if os.path.isfile("{}/datos_{}_{}.csv".format(download_location, start_date, end_date)):
                                print('{} {} download successful'.format(station_component.get("title"), time_resolution_ghost[i]))
                                # store as e.g. <storage_location>/PM10_hourly.csv
                                shutil.copyfile("{}/datos_{}_{}.csv".format(download_location, start_date, end_date), storage_location+component_ghost+'_'+time_resolution_ghost[i]+'.csv')
                            os.remove("{}/datos_{}_{}.csv".format(download_location, start_date, end_date))
                            # back to the navigation frame for the next component
                            driver.switch_to.default_content()
                            driver.switch_to.frame("left")
                        i=i+1
            driver.close()
            errcode = 200
        except WebDriverException as e:
            # any browser/driver failure: log it and retry (headless from now on)
            print(e)
            n_tries = n_tries+1
            print("Number of tries: {}".format(n_tries))
            continue
    if n_tries == n_max_tries:
        print('Failed downloading CHILE_SINCA data {} times in {} seconds'.format(n_tries, max_time_per_dl))
def download_metadata(n_max_tries, max_time_per_dl):
    """Scrape per-station metadata (region, province, commune, UTM coords,
    timezone, instruments) from the CHILE SINCA website and write it as a
    dated JSON file under metadata/network_provided/.

    Parameters:
        n_max_tries: maximum number of retry attempts after WebDriver errors.
        max_time_per_dl: timeout in seconds handed to WebDriverWait.

    NOTE(review): this block was recovered from a web-diff view with
    indentation stripped and several lines lost outright (loop headers,
    a `try:`, an `if` body, an `ins_table` assignment). The tokens below are
    preserved verbatim and the indentation is a best-effort reconstruction;
    this block is NOT runnable as-is — each broken spot is flagged inline.
    """
    # paths and variables (same component tables as download_data)
    variables_website = ['MP 10','MP 2,5', 'MP 10discreto', 'SO2', 'NO2', 'NOX', 'NO', 'CO', 'O3', 'Pb', 'As', 'Cu']
    variables_text = ["Material particulado MP 10", "Material particulado MP 2,5", "Material particulado 10 micrometros discreto", 'Dióxido de azufre', 'Dióxido de nitrógeno', 'Óxidos de nitrógeno', 'Monóxido de nitrógeno', 'Monóxido de carbono', 'Ozono', 'Plomo', 'Arsénico', 'Cobre']
    time_resolutions_website = ["registro diario", "registro horario"]
    variables_ghost = ['PM10', 'PM2.5', 'PM10disc', 'SO2', 'NO2', 'NOX', 'NO','CO', 'O3','Pb', 'As', 'Cu'] # needs to be same order as variables_website!
    time_resolution_ghost = ['daily', 'hourly']
    baseurl = 'https://sinca.mma.gob.cl/index.php/'
    # today's date stamps every updated value and names the output file
    today = date.today()
    metadata15 = pd.read_csv("/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/temp/CHILE_SINCA_META_temp.txt")
    #print(metadata15)
    instruments_old = ['O3_instrument','NO_instrument','NO2_instrument','CO_instrument','CH4_instrument','SO2_instrument','NMHC_instrument','HC_instrument','PM10_instrument','PM2.5_instrument','As_instrument','Cu_instrument','Pb_instrument']
    # previously processed metadata: station_reference -> {parameter -> {values, update_time}}
    with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/processed/CHILE_SINCA_META.json', 'r', encoding='utf-8') as f:
        metadata_old = json.loads(f.read())
    # retry loop, same pattern as download_data
    n_tries = 0
    errcode = 999
    metadata_new = {}
    while (n_tries < n_max_tries) and (errcode != 200):
        try:
            # set up driver (always headless; no download dir needed here)
            options = Options()
            #prefs = {'download.default_directory' : download_location}
            #options.add_experimental_option('prefs', prefs)
            options.add_argument("--no-sandbox")
            options.add_argument("--headless")
            svc = webdriver.ChromeService(executable_path=binary_path)
            driver = webdriver.Chrome(service=svc, options=options)
            # open url
            driver.get(baseurl)
            WebDriverWait(driver, max_time_per_dl).until(EC.presence_of_all_elements_located((By.XPATH, '//a[contains(@href, "/index.php/region/index/id/")]'))) # wait till loaded
            # navigate to regions
            html = driver.page_source
            soup = BeautifulSoup(html, features="html.parser")
            regions = soup.find_all("a", href=re.compile(r"^/index.php/region/index/id/"))
            #regions = soup.find_all("a", {"href": "/index.php/region/index/id/II"}) # Antofagasta
            # NOTE(review): `region` is undefined here — a `for region in
            # regions:` loop header was lost in extraction; the rest of this
            # try-body almost certainly belongs inside that loop.
            driver.get("https://sinca.mma.gob.cl/"+region.get("href"))
            html = driver.page_source
            soup = BeautifulSoup(html, features="html.parser")
            stations = soup.find_all("a", href=re.compile(r"^/index.php/estacion/index/id/"))
            # NOTE(review): `station` is undefined — a `for station in
            # stations:` loop header was lost here as well.
            station_name_new = station.getText()
            print(station_name_new)
            driver.get("https://sinca.mma.gob.cl/"+station.get("href"))
            time.sleep(3)
            # get meta info from the station page's info table
            html = driver.page_source
            soup = BeautifulSoup(html, features="html.parser")
            region = soup.find("th", text="Región").find_next_sibling().getText()
            province = soup.find("th", text="Provincia").find_next_sibling().getText()
            commune = soup.find("th", text="Comuna").find_next_sibling().getText()
            UTM_coordinates = soup.find("th", text="Coordenadas UTM").find_next_sibling().getText().replace(' ', '')
            # coordinates come as one string "<easting>E<northing>N..."; split on 'E'
            lon = UTM_coordinates.split('E')[0]+'E'
            lat = UTM_coordinates.split('E')[1].split("\n")[0]
            timezone = soup.find("th", text="Huso horario").find_next_sibling().getText().replace(' ', '')
            # NOTE(review): `ins_table` is never assigned in the visible code —
            # the line extracting the instruments table was lost.
            if ins_table is not None: # check if there are instruments for air pollution at this station
                instruments = ins_table.find_all("td", {"class": "helpTecnica center"})
                instruments_per_component = {}
            else:
                continue
            for instrument in instruments:
                # map the instrument row back to its component's GHOST name
                component = instrument.find_parent().find('a').getText()
                component = variables_ghost[variables_text.index(component)]
            # NOTE(review): orphan `except:` — its matching `try:` (around the
            # two lookups above, falling back from variables_text to
            # variables_website) was lost in extraction.
            except:
                try:
                    component = variables_ghost[variables_website.index(component)]
                except:
                    pass
                # special-case label like 'Ozono.-...' that matches neither table
                if 'Ozono.-' in component:
                    component = 'O3'
                #======
                # NOTE(review): the `if` body below is missing (lost lines) —
                # presumably it skipped/blanked unreported instruments
                # ("No informado" = "not reported").
                if "No informado" in instrument.getText():
                else:
                    # collapse whitespace; instrument name is the last line of the cell
                    instrument_name = re.sub(' +', ' ', instrument.getText())
                    instrument_name = instrument_name.split("\n")[-1]
                    instruments_per_component[component] = instrument_name
            # match the scraped station against the old metadata by station name
            for station_reference in metadata_old:
                if metadata_old[station_reference]['station_name']['values'][0] == station_name_new: # match station with previously referenced station reference from old file
                    i=0
                    metadata_new[station_reference] = {} # create inner dictionary
                    # order must mirror the non-instrument parameter order in metadata_old
                    scraped_metadata = [station_reference, station_name_new, region, province, commune, lon, lat, timezone]
                    for parameter in metadata_old[station_reference]: # loop through the meta parameters
                        if ("instrument" not in parameter) and ("comments" not in parameter): # go through all that are not instruments
                            metadata_new[station_reference][parameter] = {"values": [scraped_metadata[i]], "update_time": [today.strftime('%Y-%m-%d')]}
                            i=i+1
                            # NOTE(review): the next line immediately overwrites the
                            # assignment above with '' — a surrounding try/except
                            # (IndexError once scraped_metadata is exhausted) was
                            # probably lost in extraction.
                            metadata_new[station_reference][parameter] = {"values": [''], "update_time": [today.strftime('%Y-%m-%d')]}
                        else: # go through the instruments
                            for component in instruments_per_component:
                                if component in parameter:
                                    metadata_new[station_reference][parameter] = {"values": [instruments_per_component[component]], "update_time": [today.strftime('%Y-%m-%d')]}
                                    # NOTE(review): same pattern — this overwrite with ''
                                    # likely belonged to a lost else-branch.
                                    metadata_new[station_reference][parameter] = {"values": [''], "update_time": [today.strftime('%Y-%m-%d')]}
            # safe (save the freshly scraped metadata, stamped with today's date)
            with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/CHILE_SINCA_META_{}.json'.format(today.strftime('%Y%m%d')), 'w', encoding='utf-8') as f:
                f.write(json.dumps(metadata_new, indent=4, ensure_ascii=False))
            driver.close()
            errcode = 200
        except WebDriverException as e:
            print(e)
            n_tries = n_tries+1
            print("Number of tries: {}".format(n_tries))
            continue
    if n_tries == n_max_tries:
        # NOTE(review): the trailing triple-quote below was fused onto this line
        # by the extraction (it pairs with the stray `"""` on the next source
        # line, which opened a commented-out code region).
        print('Failed downloading CHILE_SINCA metadata {} times in {} seconds'.format(n_tries, max_time_per_dl))"""
"""
# create json from original metadata file =====================================================================================
json_metadata = {}
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/CHILE_SINCA_META.txt', 'r') as file:
csv_filedata = csv.DictReader(file)
for row in csv_filedata:
key = row['station_reference']
update_date = today.strftime('%Y-%m-%d')
for parameter in row:
row[parameter] = {'values': [row[parameter]], 'update_time': [update_date]} # create inner dictionary for every parameter
json_metadata[key] = row
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/processed/CHILE_SINCA_META.json', 'w', encoding='utf-8') as f:
# read newly scraped file
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/network_provided/CHILE_SINCA_META_{}.json'.format(today.strftime('%Y%m%d')), 'r', encoding='utf-8') as f:
json_metadata_now = json.loads(f.read())
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
# read standardised file to compare!
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/processed/CHILE_SINCA_META.json', 'r', encoding='utf-8') as f:
json_metadata = json.loads(f.read())
for station in json_metadata: # loop through all the old stations
if station in json_metadata_now.keys(): # if station is in current meta data, go on
for parameter in json_metadata[station]:
if json_metadata[station][parameter]['values'][-1] != json_metadata_now[station][parameter]['values'][0]: # compare last entry in standardised file to value in new file
# if different value, append the standardised metadeta file
print("old {} --- new {}".format(json_metadata[station][parameter]['values'][-1], json_metadata_now[station][parameter]['values'][0]))
json_metadata[station][parameter]['values'].append(json_metadata_now[station][parameter]['values'][0])
json_metadata[station][parameter]['update_time'].append(json_metadata_now[station][parameter]['update_time'][0])
else:
pass
else:
print('Station {} was abolished'.format(station))
for station in json_metadata_now: # loop through all the new stations
if station in json_metadata.keys(): # if station is in old meta data
pass # comparison was done before
else: # new station appeared!
print('New station {}'.format(station))
json_metadata.update({station: json_metadata_now[station]})
# safe
with open('/esarchive/obs/ghost/CHILE_SINCA/metadata/processed/CHILE_SINCA_META.json', 'w', encoding='utf-8') as f: