Authors¶
Preamble¶
Datasets in LiPD format have been made available through a knowledge graph, which we call the LiPDGraph, that can be queried directly through it SPARQL endpoint. This notebook walks through the steps to create useful queries on the graph. Although it results in one query, it is presented as a didactic example of the type of thinking that goes into graph querying.
More information about PaleoPAL can be found here.
Technical skills involved:
Understanding the graph representation of LiPD datasets.
Some basic knowledge of the SPARQL language
Data¶
The data is from the Temperature 12k Database by Kaufman et al. (2020), stored in the Linked PaleoData (LiPD) format on the LiPDGraph.
Reading time¶
10 min
Keywords¶
LiPD; SPARQL
Let’s import the necessary packages:
# To query the graph
import json
import requests
import pandas as pd
import ioAnalysis¶
agent sparql Return the following information for all the datasets in the Temp12k compilation with a StandardName as temperature: the name of the dataset as dataSetName, archiveType, Latitude, Longitude and Elevation (optional for elevation), the variable name (as paleoData_variableName), the standard name (as paleoData_standardName), the values (as paleoData_values), units (as paleoData_units, optional), the proxy as paleoData_proxy (optional), the general proxy as paleoData_proxyGeneral (optional), TSiD. the associated time variable as time_variableName, the time standard name (either year or age) as time_standardName, the time values as time_values and time_units as time_units
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
def fetch_sparql(endpoint_url: str, query: str) -> pd.DataFrame:
sparql = SPARQLWrapper(endpoint_url)
sparql.setQuery(query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
# Get variable names (column names)
cols = results["head"]["vars"]
# Build rows
rows = []
for result in results["results"]["bindings"]:
row = {}
for col in cols:
if col in result:
row[col] = result[col]["value"]
else:
row[col] = None
rows.append(row)
return pd.DataFrame(rows)
endpoint = "https://linkedearth.graphdb.mint.isi.edu/repositories/LiPDVerse-dynamic"
query = """PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX xml: <http://www.w3.org/XML/1998/namespace>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX le: <http://linked.earth/ontology#>
PREFIX pvar: <http://linked.earth/ontology/paleo_variables#>
PREFIX pproxy: <http://linked.earth/ontology/paleo_proxy#>
PREFIX arch: <http://linked.earth/ontology/archive#>
PREFIX punits: <http://linked.earth/ontology/paleo_units#>
PREFIX interp: <http://linked.earth/ontology/interpretation#>
SELECT DISTINCT
?dataSetName
?archiveType
?Latitude
?Longitude
?Elevation
?paleoData_variableName
?paleoData_standardName
?paleoData_values
?paleoData_units
?paleoData_proxy
?paleoData_proxyGeneral
?TSiD
?time_variableName
?time_standardName
?time_values
?time_units
WHERE {
?dataset a le:Dataset ;
le:hasName ?dataSetName ;
le:hasArchiveType ?archiveType ;
le:hasLocation ?loc ;
le:hasPaleoData ?paleoData .
?loc a le:Location ;
le:hasLatitude ?Latitude ;
le:hasLongitude ?Longitude .
OPTIONAL { ?loc le:hasElevation ?Elevation . }
?paleoData a le:PaleoData ;
le:hasMeasurementTable ?table .
?table a le:DataTable .
# Temperature variable in Temp12k compilation
?table le:hasVariable ?tempVar .
?tempVar a le:Variable ;
le:hasName ?paleoData_variableName ;
le:hasValues ?paleoData_values ;
le:hasVariableId ?TSiD ;
le:hasInterpretation ?interp ;
le:hasStandardVariable ?paleoData_standardName ;
le:partOfCompilation ?comp .
?interp a le:Interpretation ;
le:hasVariable interp:temperature .
FILTER(?paleoData_standardName = pvar:temperature)
OPTIONAL { ?tempVar le:hasUnits ?paleoData_units . }
OPTIONAL { ?tempVar le:hasProxy ?paleoData_proxy . }
OPTIONAL { ?tempVar le:hasProxyGeneral ?paleoData_proxyGeneral . }
?comp a le:Compilation ;
le:hasName ?compName .
FILTER(regex(?compName, "Temp.?12k", "i"))
# Associated time variable
?table le:hasVariable ?timeVar .
?timeVar a le:Variable ;
le:hasName ?time_variableName ;
le:hasStandardVariable ?time_standardName ;
le:hasValues ?time_values ;
le:hasUnits ?time_units .
FILTER(?time_standardName IN (pvar:age, pvar:year))
}"""
df_search = fetch_sparql(endpoint, query) # rename dataframe to df_search
display(df_search.head()) # change "print" to "display"
Let’s see how many results we have:
len(df_search)1580The results seem to also be filtered by interpretation and I would rather have the labels for the archiveType, proxy and variable name and units. Let’s ask PaleoPAL to rewrite this query:
agent sparql In the above query, do not look for the interpretation as temperature field. Only use the variableName. Also return the labels instead of the URI for archiveType, units, standardName, proxy and proxyGeneral. I want the labels to be returned as “archiveType” so create an intermediate object for the URI.
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
def fetch_sparql(endpoint_url: str, query: str) -> pd.DataFrame:
sparql = SPARQLWrapper(endpoint_url)
sparql.setQuery(query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
# Get variable names (column names)
cols = results["head"]["vars"]
# Build rows
rows = []
for result in results["results"]["bindings"]:
row = {}
for col in cols:
if col in result:
row[col] = result[col]["value"]
else:
row[col] = None
rows.append(row)
return pd.DataFrame(rows)
endpoint = "https://linkedearth.graphdb.mint.isi.edu/repositories/LiPDVerse-dynamic"
query = """PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX xml: <http://www.w3.org/XML/1998/namespace>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX le: <http://linked.earth/ontology#>
PREFIX pvar: <http://linked.earth/ontology/paleo_variables#>
PREFIX pproxy: <http://linked.earth/ontology/paleo_proxy#>
PREFIX arch: <http://linked.earth/ontology/archive#>
PREFIX punits: <http://linked.earth/ontology/paleo_units#>
SELECT DISTINCT
?dataSetName
?archiveType
?Latitude
?Longitude
?Elevation
?paleoData_variableName
?paleoData_standardName
?paleoData_values
?paleoData_units
?paleoData_proxy
?paleoData_proxyGeneral
?TSiD
?time_variableName
?time_standardName
?time_values
?time_units
WHERE {
?dataset a le:Dataset ;
le:hasName ?dataSetName ;
le:hasArchiveType ?archiveTypeURI ;
le:hasLocation ?loc ;
le:hasPaleoData ?paleoData .
?archiveTypeURI rdfs:label ?archiveType .
?loc a le:Location ;
le:hasLatitude ?Latitude ;
le:hasLongitude ?Longitude .
OPTIONAL { ?loc le:hasElevation ?Elevation . }
?paleoData a le:PaleoData ;
le:hasMeasurementTable ?table .
?table a le:DataTable .
# Select variables by their name only (no interpretation constraint)
?table le:hasVariable ?paleoVar .
?paleoVar a le:Variable ;
le:hasName ?paleoData_variableName ;
le:hasValues ?paleoData_values ;
le:hasVariableId ?TSiD ;
le:partOfCompilation ?comp .
FILTER(regex(?paleoData_variableName, "temp", "i"))
OPTIONAL {
?paleoVar le:hasStandardVariable ?paleoData_standardNameURI .
?paleoData_standardNameURI rdfs:label ?paleoData_standardName .
}
OPTIONAL {
?paleoVar le:hasUnits ?paleoData_unitsURI .
?paleoData_unitsURI rdfs:label ?paleoData_units .
}
OPTIONAL {
?paleoVar le:hasProxy ?paleoData_proxyURI .
?paleoData_proxyURI rdfs:label ?paleoData_proxy .
}
OPTIONAL {
?paleoVar le:hasProxyGeneral ?paleoData_proxyGeneralURI .
?paleoData_proxyGeneralURI rdfs:label ?paleoData_proxyGeneral .
}
?comp a le:Compilation ;
le:hasName ?compName .
#FILTER(regex(?compName, "Temp.?12k", "i"))
FILTER(?compName = 'Temp12k')
# Associated time variable with label returns
?table le:hasVariable ?timeVar .
?timeVar a le:Variable ;
le:hasName ?time_variableName ;
le:hasStandardVariable ?time_standardNameURI ;
le:hasValues ?time_values ;
le:hasUnits ?time_unitsURI .
FILTER(?time_standardNameURI IN (pvar:age, pvar:year))
?time_standardNameURI rdfs:label ?time_standardName .
?time_unitsURI rdfs:label ?time_units .
}"""
sparql_results_1767729896_fbfadce4 = fetch_sparql(endpoint, query)
#print(sparql_results_1767729896_fbfadce4.head())
display(sparql_results_1767729896_fbfadce4.head())len(sparql_results_1767729896_fbfadce4)65184There seems to be a lot of duplicates. Let’s filter by unique TSids.
agent code Filter the sparql_results_1767729896_fbfadce4 dataframe by unique TSiD
# Drops duplicate rows in sparql_results_1767729896_fbfadce4 based on the TSiD column, keeping the first occurrence. Reports counts before and after, and displays a preview of the filtered dataframe.
# Auto-generated by PaleoPal CodeGenerationAgent with contextual search
# Filter the sparql_results_1767729896_fbfadce4 dataframe to unique TSiD values
import pandas as pd
from IPython.display import display
# Record initial number of rows
_before_n = len(sparql_results_1767729896_fbfadce4)
# Keep the first occurrence for each unique TSiD
sparql_results_1767729896_fbfadce4_unique_TSiD = (
sparql_results_1767729896_fbfadce4.drop_duplicates(subset=["TSiD"]).reset_index(drop=True)
)
# Report counts before and after
after_n = len(sparql_results_1767729896_fbfadce4_unique_TSiD)
print(f"Rows before: {_before_n}")
print(f"Rows after unique TSiD: {after_n}")
print(f"Duplicates removed: {_before_n - after_n}")
# Preview the filtered dataframe
display(sparql_results_1767729896_fbfadce4_unique_TSiD.head())
Rows before: 65184
Rows after unique TSiD: 1279
Duplicates removed: 63905
agent sparql can you rewrite the query so only variables associated with age (not year) are returned? and ensure that the TSiD are unique.
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
def fetch_sparql(endpoint_url: str, query: str) -> pd.DataFrame:
sparql = SPARQLWrapper(endpoint_url)
sparql.setQuery(query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
# Get variable names (column names)
cols = results["head"]["vars"]
# Build rows
rows = []
for result in results["results"]["bindings"]:
row = {}
for col in cols:
if col in result:
row[col] = result[col]["value"]
else:
row[col] = None
rows.append(row)
return pd.DataFrame(rows)
endpoint = "https://linkedearth.graphdb.mint.isi.edu/repositories/LiPDVerse-dynamic"
query = """PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX xml: <http://www.w3.org/XML/1998/namespace>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX le: <http://linked.earth/ontology#>
PREFIX pvar: <http://linked.earth/ontology/paleo_variables#>
PREFIX pproxy: <http://linked.earth/ontology/paleo_proxy#>
PREFIX arch: <http://linked.earth/ontology/archive#>
PREFIX punits: <http://linked.earth/ontology/paleo_units#>
SELECT
?TSiD
(SAMPLE(?dataSetName) AS ?dataSetName)
(SAMPLE(?archiveType) AS ?archiveType)
(SAMPLE(?Latitude) AS ?Latitude)
(SAMPLE(?Longitude) AS ?Longitude)
(SAMPLE(?Elevation) AS ?Elevation)
(SAMPLE(?paleoData_variableName) AS ?paleoData_variableName)
(SAMPLE(?paleoData_standardName) AS ?paleoData_standardName)
(SAMPLE(?paleoData_values) AS ?paleoData_values)
(SAMPLE(?paleoData_units) AS ?paleoData_units)
(SAMPLE(?paleoData_proxy) AS ?paleoData_proxy)
(SAMPLE(?paleoData_proxyGeneral) AS ?paleoData_proxyGeneral)
(SAMPLE(?time_variableName) AS ?time_variableName)
(SAMPLE(?time_standardName) AS ?time_standardName)
(SAMPLE(?time_values) AS ?time_values)
(SAMPLE(?time_units) AS ?time_units)
WHERE {
?dataset a le:Dataset ;
le:hasName ?dataSetName ;
le:hasArchiveType ?archiveTypeURI ;
le:hasLocation ?loc ;
le:hasPaleoData ?paleoData .
?archiveTypeURI rdfs:label ?archiveType .
?loc a le:Location ;
le:hasLatitude ?Latitude ;
le:hasLongitude ?Longitude .
OPTIONAL { ?loc le:hasElevation ?Elevation . }
?paleoData a le:PaleoData ;
le:hasMeasurementTable ?table .
?table a le:DataTable .
# Paleo variable filtered by name (temperature-like), no interpretation constraint
?table le:hasVariable ?paleoVar .
?paleoVar a le:Variable ;
le:hasName ?paleoData_variableName ;
le:hasValues ?paleoData_values ;
le:hasVariableId ?TSiD ;
le:partOfCompilation ?comp .
FILTER(regex(?paleoData_variableName, "temp", "i"))
OPTIONAL {
?paleoVar le:hasStandardVariable ?paleoData_standardNameURI .
?paleoData_standardNameURI rdfs:label ?paleoData_standardName .
}
OPTIONAL {
?paleoVar le:hasUnits ?paleoData_unitsURI .
?paleoData_unitsURI rdfs:label ?paleoData_units .
}
OPTIONAL {
?paleoVar le:hasProxy ?paleoData_proxyURI .
?paleoData_proxyURI rdfs:label ?paleoData_proxy .
}
OPTIONAL {
?paleoVar le:hasProxyGeneral ?paleoData_proxyGeneralURI .
?paleoData_proxyGeneralURI rdfs:label ?paleoData_proxyGeneral .
}
?comp a le:Compilation ; le:hasName ?compName .
FILTER(?compName = "Temp12k")
# Associated time variable must be age (exclude year)
?table le:hasVariable ?timeVar .
?timeVar a le:Variable ;
le:hasName ?time_variableName ;
le:hasStandardVariable ?time_standardNameURI ;
le:hasValues ?time_values ;
le:hasUnits ?time_unitsURI .
FILTER(?time_standardNameURI = pvar:age)
?time_standardNameURI rdfs:label ?time_standardName .
?time_unitsURI rdfs:label ?time_units .
}
GROUP BY ?TSiD"""
sparql_results_1767730535_65f2d398 = fetch_sparql(endpoint, query)
#print(sparql_results_1767730535_65f2d398.head())
display(sparql_results_1767730535_65f2d398.head())
Make sure we have the right number of timeseries:
len(sparql_results_1767730535_65f2d398)1279Make sure we have only age:
sparql_results_1767730535_65f2d398['time_standardName'].unique()array(['age'], dtype=object)Make sure we only have temperatures:
sparql_results_1767730535_65f2d398['paleoData_standardName'].unique()array(['temperature'], dtype=object)sparql_results_1767730535_65f2d398['paleoData_variableName'].unique()array(['temperature', 'temperatureComposite'], dtype=object)- Kaufman, D., McKay, N., Routson, C., Erb, M., Dätwyler, C., Sommer, P. S., Heiri, O., & Davis, B. (2020). Holocene global mean surface temperature, a multi-method reconstruction approach. Scientific Data, 7(1). 10.1038/s41597-020-0530-7