"""
Dimcli utilities for querying and working with Dimensions data.
NOTE: these functions are attached to the top-level ``dimcli.utils`` module, so you can load them as follows:
>>> from dimcli.utils import *
"""
import click
import time
import json
import sys
import os
def gen_dslqueries(sources, text="Albert Einstein"):
    """Generate one test DSL query per source.

    Each query is a full-text search for ``text`` on the given source,
    returning the ``basics`` fieldset and limited to 10 records.

    Parameters
    ----------
    sources: iterable of str
        Source names, e.g. the output of ``G.sources()``.
    text: str, default="Albert Einstein"
        The search text placed between double quotes in every query.
        It is inserted verbatim (no escaping is applied).

    Returns
    -------
    iterator of str
        The generated queries, in the reverse order of ``sources``.
        NOTE this is a one-shot iterator: wrap it in ``list()`` if you need
        to index it or loop over it more than once.

    Example
    -------
    >>> from dimcli import G
    >>> gen_dslqueries(G.sources())
    """
    template = """ search {} for "{}" return {}[basics] limit 10 """
    # The template has three slots: source, search text, source.
    queries = [template.format(source, text, source) for source in sources]
    return reversed(queries)
def dslquery(query_string):
    """Run a single DSL query without having to instantiate ``dimcli.Dsl()`` first.

    Kept for backward compatibility with legacy API tutorials. Requires
    file-based credentials for logging in.

    Parameters
    ----------
    query_string: str
        A valid DSL query.

    Returns
    -------
    DslDataset
        A Dimcli wrapper object containing JSON data. If there is no global
        login session, nothing is queried and ``None`` is returned.
    """
    # Imported lazily, as in the rest of this module.
    from ..core.auth import is_logged_in_globally as logged_in
    from ..core.api import Dsl

    if not logged_in():
        return None
    client = Dsl()
    return client.query(query_string, verbose=True)
def dslquery_json(query_string):
    """Run a single DSL query and return the raw JSON, without instantiating ``dimcli.Dsl()``.

    Same as ``dslquery`` but returns the raw JSON instead of an
    ``Api.DslDataset`` object. Kept for backward compatibility with legacy
    API tutorials. Requires file-based credentials for logging in.

    Parameters
    ----------
    query_string: str
        A valid DSL query.

    Returns
    -------
    Dict
        API JSON data, represented as a dict object. If there is no global
        login session, nothing is queried and ``None`` is returned.
    """
    # Imported lazily, as in the rest of this module.
    from ..core.auth import is_logged_in_globally as logged_in
    from ..core.api import Dsl

    if not logged_in():
        return None
    client = Dsl()
    dataset = client.query(query_string)
    return dataset.json
def dslqueryall(query_string):
    """Run a loop (iterative) DSL query without having to instantiate ``dimcli.Dsl()`` first.

    Kept for backward compatibility with legacy API tutorials. Requires
    file-based credentials for logging in.

    Parameters
    ----------
    query_string: str
        A valid DSL query.

    Returns
    -------
    DslDataset
        A Dimcli wrapper object containing JSON data. If there is no global
        login session, nothing is queried and ``None`` is returned.
    """
    # Imported lazily, as in the rest of this module.
    from ..core.auth import is_logged_in_globally as logged_in
    from ..core.api import Dsl

    if not logged_in():
        return None
    client = Dsl()
    return client.query_iterative(query_string)
def dimensions_url(obj_id, obj_type="", verbose=True):
    """Generate a valid Dimensions URL for one of the available sources.

    Parameters
    ----------
    obj_id: str
        A Dimensions ID for one of the available sources. Leading/trailing
        whitespace is ignored.
    obj_type: str, optional
        The name of the source: one of 'publications', 'grants', 'patents', 'policy_documents', 'clinical_trials', 'researchers'.
        If not provided (or not a known source), it is inferred from the ID prefix.
    verbose: bool, optional
        Currently unused; kept so that existing callers keep working.

    Returns
    -------
    str or None
        The object URL. An empty string is returned when ``obj_id`` is not a
        string (e.g. a NaN value coming from a dataframe cell). ``None`` is
        returned when no source or URL can be determined for the ID.

    Example
    -------
    >>> from dimcli.utils import dimensions_url
    >>> dimensions_url("pub.1127419018")
    'https://app.dimensions.ai/details/publication/pub.1127419018'
    """
    # Guard first: non-string values cannot be IDs, so there is no need to
    # load the grammar at all. isinstance() also accepts str subclasses
    # (e.g. numpy.str_ values found in dataframes).
    if not isinstance(obj_id, str):
        return ""

    from ..core.dsl_grammar import G

    obj_id = obj_id.strip()

    # An unknown source name is ignored rather than rejected; we fall back
    # to inferring the source from the ID.
    if obj_type and obj_type not in G.sources():
        obj_type = ""

    if not obj_type:
        # No early exit: if several prefixes match, the last one wins.
        for source, prefix in G.object_id_patterns().items():
            if obj_id.startswith(prefix):
                obj_type = source

    if obj_type:
        url = G.url_for_source(obj_type)
        if url:
            return url + obj_id

    return None
def dimensions_search_url(keywords_list_as_string):
    """Build the Dimensions web-app URL for a full-text keyword search on publications.

    Parameters
    ----------
    keywords_list_as_string: str
        The search keywords, as a single string. It gets percent-encoded
        before being placed in the URL.

    Returns
    -------
    str
        The Dimensions URL.

    Example
    -------
    >>> from dimcli.utils import dimensions_search_url
    >>> dimensions_search_url("graphene AND south korea")
    'https://app.dimensions.ai/discover/publication?search_text=graphene%20AND%20south%20korea&search_type=kws&search_field=full_search'
    """
    from urllib.parse import quote

    url_template = """https://app.dimensions.ai/discover/publication?search_text={}&search_type=kws&search_field=full_search"""
    return url_template.format(quote(keywords_list_as_string))
def dsl_escape(stringa, all=False):
    r"""Escape a full-text search string so that it can be embedded in a DSL query.

    Needed when the inner query string includes quotes. For example, the
    search string::

        "2019-nCoV" OR "COVID-19"

    has to be embedded in a DSL query as::

        \"2019-nCoV\" OR \"COVID-19\"

    See also: https://docs.dimensions.ai/dsl/language.html#for-search-term

    Parameters
    ----------
    stringa: str
        Full-text search component of a DSL query.
    all: bool, default=False
        By default only double quotes are escaped. Set to True to escape all
        special characters (e.g. colons) as well.

    Returns
    -------
    str
        The input with a backslash placed before each escaped character.

    Example
    -------
    >>> dsl_escape('Solar cells: a new technology?', True)
    'Solar cells\\: a new technology?'
    """
    # Characters that get a backslash prefix: the full special-character set
    # when `all` is requested, otherwise just the double quote.
    to_escape = '^"\\:~[]{}()!|&+' if all else '"'
    table = str.maketrans({char: "\\" + char for char in to_escape})
    return stringa.translate(table)
[docs]def dimensions_styler(df, source_type="", title_links=True):
"""Format the text display value of a dataframe by including Dimensions hyperlinks whenever possible.
Useful mainly in notebooks when printing out dataframes and clicking on links etc..
Expects column names to match the default DSL field names.
Parameters
----------
df: pd.Dataframe
Pandas dataframe obtained from a DSL query e.g. via the `as_dataframe` methods.
source_type: str, optional
The name of the source: one of 'publications', 'grants', 'patents', 'policy_documents', 'clinical_trials', 'researchers'. If not provided, it can be inferred in some cases.
title_links: bool, optional, True
Hyperlink document titles too, using the ID (if available).
Notes
-----
Implemented using https://pandas.pydata.org/docs/reference/api/pandas.io.formats.style.Styler.format.html. Side effect is that the resulting dataframe becomes an instance of pandas.io.formats.style.styler, which is a wrapper around the underlying Styler object. TIP To get back to the original dataframe, you can use the `.data` method.
See also: https://stackoverflow.com/questions/42263946/how-to-create-a-table-with-clickable-hyperlink-in-pandas-jupyter-notebook
Returns
-------
pandas.io.formats.style.Styler
Wrapper for a dataframe object, including custom Dimensions hyperlinks.
Example
-------
>>> from dimcli.utils import dimensions_styler
>>> dsl = dimcli.Dsl()
>>> q = 'search publications for "scientometrics" return publications[basics]'
>>> df = dsl.query(q).as_dataframe()
>>> dimensions_styler(df)
#
# alternatively, using the shortcut method:
#
>>> dsl.query(q).as_dataframe(links=True)
"""
format_rules = {}
cols = [x.lower() for x in df.columns]
REPLACE_TITLES = title_links
cols_to_drop = []
def df_value_as_link(url, val, url_root="", verbose=False):
"""Generic method to create an HTML hyperlink from a dataframe cell value and a URL.
If cell value is list, we just take the first element (e.g. for 'linkout' field).
NOTE If cell value is a float, it means it's a Pandas NaN. So we don't want to return a link.
"""
if verbose: print(f"""url: {url} / val: {val} / url_root: {url_root}""")
if not val or type(val) == float:
return val
if type(val) == list:
url = val[0]
if url_root:
url = url_root + url
elif "###" in val: # title URL
val, url = val.split("###")
return '<a target="_blank" href="{}">{}</a>'.format(url, val)
def df_format_gridids(val, verbose=False):
"""Version of df_value_as_link() for lists of GRID IDs.
NOTE If cell value is a float, it means it's a Pandas NaN. So we don't want to return a link.
val: string
List of GRID IDs separated by ';' (normal output of 'nice' converters)
"""
if verbose: print(f"""val: {val} """)
if not val or type(val) == float:
return val
grids = val.split(";")
z = ['<a target="_blank" href="{}">{}</a>'.format(dimensions_url(g.strip(), "organizations"), g) for g in grids]
return "; ".join(z)
# TRANSFORMATIONS
# NOTE multiple naming supported, so to handle columm conversions obtained via --nice flag
for col in ["dimensions_url", 'Dimensions URL']:
if col.lower() in cols:
cols_to_drop += [col] # always drop cause IDs get linked already
# format_rules[col] = lambda x: df_value_as_link(x, x)
for col in ["linkout", "Source Linkout", "Linkout"]:
if col.lower() in cols:
# ps this is a list, only first el will be used
format_rules[col] = lambda x: df_value_as_link(x, x)
for col in ["doi", 'DOI']:
if col.lower() in cols:
url_root = "https://doi.org/"
format_rules[col] = lambda x: df_value_as_link(x, x, url_root)
for col in ["id", 'Publication ID', 'Patent ID', 'Dataset ID', 'Trial ID',
'Policy ID', 'Grant ID', 'GRID ID', 'Researcher ID', 'Report ID']:
if col.lower() in cols:
# print("Matched =", col)
format_rules[col] = lambda x: df_value_as_link(dimensions_url(x, source_type), x)
# HYPERLINK THE TITLE AS WELL, USING THE ID
# create a new col with URL+title and split it when formatting the table
if REPLACE_TITLES:
title_names = ["title", "Title", "name", "Name"]
for t in title_names:
if t in df.columns:
# print("Matched Title=", t, source_type)
df[t] = df[t] + '###' + df[col].apply(lambda x: dimensions_url(x, source_type))
format_rules[t] = lambda x: df_value_as_link(x, x)
cols_to_drop += [col]
for col in ["journal.id", 'Source ID']:
if col.lower() in cols:
format_rules[col] = lambda x: df_value_as_link(dimensions_url(x, "source_titles"), x)
# HYPERLINK THE TITLE AS WELL, USING THE ID
# create a new col with URL+title and split it when formatting the table
if REPLACE_TITLES:
title_names = ["journal.title", "Source title"]
for t in title_names:
if t in df.columns:
df[t] = df[t] + '###' + df[col].apply(lambda x: dimensions_url(x, "source_titles"))
format_rules[t] = lambda x: df_value_as_link(x, x)
cols_to_drop += [col]
# denorm data for cols resulting from dimcli df methods
# TODO more testing needed
if "pub_id" in cols:
format_rules["pub_id"] = lambda x: df_value_as_link(dimensions_url(x, "publications"), x)
if "researcher_id" in cols:
format_rules["researcher_id"] = lambda x: df_value_as_link(dimensions_url(x, "researchers"), x)
if "grant_id" in cols:
format_rules["grant_id"] = lambda x: df_value_as_link(dimensions_url(x, "grants"), x)
if "aff_id" in cols:
format_rules["aff_id"] = lambda x: df_value_as_link(dimensions_url(x, "organizations"), x)
if "current_organization_id" in cols:
format_rules["current_organization_id"] = lambda x: df_value_as_link(dimensions_url(x, "organizations"), x)
for col in ["orcid_id", 'Orcid IDs']:
if col.lower() in cols:
url_root = "https://orcid.org/"
format_rules[col] = lambda x: df_value_as_link(x, x, url_root)
for col in ["GRID IDs", "Funders GRID IDs", "Assignees GRID IDs"]:
if col.lower() in cols:
format_rules[col] = lambda x: df_format_gridids(x)
df = df.style.format(format_rules)
if cols_to_drop:
df = df.hide_columns(cols_to_drop)
return df