Source code for dimcli.utils.dim_utils

"""
Dimcli utilities for querying and working with Dimensions data.  
NOTE: these functions are attached to the top level ``dimcli.utils`` module. So you can load them as follows:

>>> from dimcli.utils import *

"""


import click
import time
import json
import sys
import os





[docs]def gen_dslqueries(sources, text="Albert Einstein"): """Generate test DSL queries for each source eg >>> from dimcli import G >>> gen_dslqueries(G.sources()) """ _q = """ search {} for "{}" return {}[basics] limit 10 """ out = [] for source in sources: out += [_q.format(source, source)] return reversed(out)
[docs]def dslquery(query_string): """Shortcut for running a query without instantiating dimcli.Dsl(). Added for backward compatibility with legacy API tutorials. Requires file-based credentials for logging in. Parameters ---------- query_string: str A valid DSL query. Returns ------- DslDataset A Dimcli wrapper object containing JSON data. """ from ..core.auth import is_logged_in_globally as is_logged_in from ..core.api import Dsl if is_logged_in(): dsl = Dsl() res = dsl.query(query_string, verbose=True) return res
[docs]def dslquery_json(query_string): """Shortcut for running a query without instantiating dimcli.Dsl(). Same as ``dslquery`` but returns raw JSON instead of Api.DslDataset object Added for backward compatibility with legacy API tutorials. Requires file-based credentials for logging in. Parameters ---------- query_string: str A valid DSL query. Returns ------- Dict API JSON data, represented as a dict object. """ from ..core.auth import is_logged_in_globally as is_logged_in from ..core.api import Dsl if is_logged_in(): dsl = Dsl() return dsl.query(query_string).json
[docs]def dslqueryall(query_string): """Shortcut for running a loop query without instantiating dimcli.Dsl(). Added for backward compatibility with legacy API tutorials. Requires file-based credentials for logging in. Parameters ---------- query_string: str A valid DSL query. Returns ------- DslDataset A Dimcli wrapper object containing JSON data. """ from ..core.auth import is_logged_in_globally as is_logged_in from ..core.api import Dsl if is_logged_in(): dsl = Dsl() return dsl.query_iterative(query_string)
[docs]def dimensions_url(obj_id, obj_type="", verbose=True): """Generate a valid Dimensions URL for one of the available sources. Parameters ---------- obj_id: str A Dimensions ID for one of the available sources. obj_type: str, optional The name of the source: one of 'publications', 'grants', 'patents', 'policy_documents', 'clinical_trials', 'researchers'. If not provided, it's inferred using the ID structure. Returns ------- str The object URL. Example ---------- >>> from dimcli.utils import dimensions_url >>> dimensions_url("pub.1127419018") 'https://app.dimensions.ai/details/publication/pub.1127419018' """ from ..core.dsl_grammar import G if type(obj_id) != str: return "" obj_id = obj_id.strip() if obj_type and (obj_type not in G.sources()): obj_type = "" # raise ValueError("ERROR: valid sources are: " + " ".join([x for x in G.sources()])) if not obj_type: # then infer it from the ID for source, prefix in G.object_id_patterns().items(): # print("Inferring source from ID: {}".format(obj_id), source, prefix) if obj_id.startswith(prefix): # print("Inferred source: {}".format(source)) obj_type = source if obj_type: url = G.url_for_source(obj_type) if url: return url + str(obj_id)
[docs]def dimensions_search_url(keywords_list_as_string): """Generate a valid keyword search URL for Dimensions. Parameters ---------- keywords_list_as_string: str List of search keywords. Returns ------- str The Dimensions URL. Example ---------- >>> from dimcli.utils import dimensions_search_url >>> dimensions_search_url("graphene AND south korea") 'https://app.dimensions.ai/discover/publication?search_text=graphene%20AND%20south%20korea&search_type=kws&search_field=full_search' """ q = """https://app.dimensions.ai/discover/publication?search_text={}&search_type=kws&search_field=full_search""" from urllib.parse import quote s = quote(keywords_list_as_string) return q.format(s)
[docs]def dsl_escape(stringa, all=False): """Helper for escaping the full-text inner query strings, when they includes quotes. EG with the query string: '"2019-nCoV" OR "COVID-19" OR "SARS-CoV-2" OR (("coronavirus" OR "corona virus") AND (Wuhan OR China))' In Python, if you want to embed it into a DSL query, it has to become: '\\"2019-nCoV\\" OR \\"COVID-19\\" OR \\"SARS-CoV-2\\" OR ((\\"coronavirus\\" OR \\"corona virus\\") AND (Wuhan OR China))' See also: https://docs.dimensions.ai/dsl/language.html#for-search-term Parameters ---------- stringa: str Full-text search component of a DSL query. all: bool, default=False By default only quotes as escaped. Set to True to escape all special characters (eg colons) Example ---------- >>> dsl_escape('Solar cells: a new technology?', True) 'Solar cells\\: a new technology?' """ if all: escaped = stringa.translate(str.maketrans({"^": r"\^", '"': r'\"', "\\": r"\\", ":": r"\:", "~": r"\~", "[": r"\[", "]": r"\]", "{": r"\{", "}": r"\}", "(": r"\(", ")": r"\)", "!": r"\!", "|": r"\|", "&": r"\&", "+": r"\+", })) else: escaped = stringa.translate(str.maketrans({'"': r'\"'})) return escaped
[docs]def dimensions_styler(df, source_type="", title_links=True): """Format the text display value of a dataframe by including Dimensions hyperlinks whenever possible. Useful mainly in notebooks when printing out dataframes and clicking on links etc.. Expects column names to match the default DSL field names. Parameters ---------- df: pd.Dataframe Pandas dataframe obtained from a DSL query e.g. via the `as_dataframe` methods. source_type: str, optional The name of the source: one of 'publications', 'grants', 'patents', 'policy_documents', 'clinical_trials', 'researchers'. If not provided, it can be inferred in some cases. title_links: bool, optional, True Hyperlink document titles too, using the ID (if available). Notes ----- Implemented using https://pandas.pydata.org/docs/reference/api/pandas.io.formats.style.Styler.format.html. Side effect is that the resulting dataframe becomes an instance of pandas.io.formats.style.styler, which is a wrapper around the underlying Styler object. TIP To get back to the original dataframe, you can use the `.data` method. See also: https://stackoverflow.com/questions/42263946/how-to-create-a-table-with-clickable-hyperlink-in-pandas-jupyter-notebook Returns ------- pandas.io.formats.style.Styler Wrapper for a dataframe object, including custom Dimensions hyperlinks. Example ------- >>> from dimcli.utils import dimensions_styler >>> dsl = dimcli.Dsl() >>> q = 'search publications for "scientometrics" return publications[basics]' >>> df = dsl.query(q).as_dataframe() >>> dimensions_styler(df) # # alternatively, using the shortcut method: # >>> dsl.query(q).as_dataframe(links=True) """ format_rules = {} cols = [x.lower() for x in df.columns] REPLACE_TITLES = title_links cols_to_drop = [] def df_value_as_link(url, val, url_root="", verbose=False): """Generic method to create an HTML hyperlink from a dataframe cell value and a URL. If cell value is list, we just take the first element (e.g. for 'linkout' field). NOTE If cell value is a float, it means it's a Pandas NaN. So we don't want to return a link. """ if verbose: print(f"""url: {url} / val: {val} / url_root: {url_root}""") if not val or type(val) == float: return val if type(val) == list: url = val[0] if url_root: url = url_root + url elif "###" in val: # title URL val, url = val.split("###") return '<a target="_blank" href="{}">{}</a>'.format(url, val) def df_format_gridids(val, verbose=False): """Version of df_value_as_link() for lists of GRID IDs. NOTE If cell value is a float, it means it's a Pandas NaN. So we don't want to return a link. val: string List of GRID IDs separated by ';' (normal output of 'nice' converters) """ if verbose: print(f"""val: {val} """) if not val or type(val) == float: return val grids = val.split(";") z = ['<a target="_blank" href="{}">{}</a>'.format(dimensions_url(g.strip(), "organizations"), g) for g in grids] return "; ".join(z) # TRANSFORMATIONS # NOTE multiple naming supported, so to handle columm conversions obtained via --nice flag for col in ["dimensions_url", 'Dimensions URL']: if col.lower() in cols: cols_to_drop += [col] # always drop cause IDs get linked already # format_rules[col] = lambda x: df_value_as_link(x, x) for col in ["linkout", "Source Linkout", "Linkout"]: if col.lower() in cols: # ps this is a list, only first el will be used format_rules[col] = lambda x: df_value_as_link(x, x) for col in ["doi", 'DOI']: if col.lower() in cols: url_root = "https://doi.org/" format_rules[col] = lambda x: df_value_as_link(x, x, url_root) for col in ["id", 'Publication ID', 'Patent ID', 'Dataset ID', 'Trial ID', 'Policy ID', 'Grant ID', 'GRID ID', 'Researcher ID', 'Report ID']: if col.lower() in cols: # print("Matched =", col) format_rules[col] = lambda x: df_value_as_link(dimensions_url(x, source_type), x) # HYPERLINK THE TITLE AS WELL, USING THE ID # create a new col with URL+title and split it when formatting the table if REPLACE_TITLES: title_names = ["title", "Title", "name", "Name"] for t in title_names: if t in df.columns: # print("Matched Title=", t, source_type) df[t] = df[t] + '###' + df[col].apply(lambda x: dimensions_url(x, source_type)) format_rules[t] = lambda x: df_value_as_link(x, x) cols_to_drop += [col] for col in ["journal.id", 'Source ID']: if col.lower() in cols: format_rules[col] = lambda x: df_value_as_link(dimensions_url(x, "source_titles"), x) # HYPERLINK THE TITLE AS WELL, USING THE ID # create a new col with URL+title and split it when formatting the table if REPLACE_TITLES: title_names = ["journal.title", "Source title"] for t in title_names: if t in df.columns: df[t] = df[t] + '###' + df[col].apply(lambda x: dimensions_url(x, "source_titles")) format_rules[t] = lambda x: df_value_as_link(x, x) cols_to_drop += [col] # denorm data for cols resulting from dimcli df methods # TODO more testing needed if "pub_id" in cols: format_rules["pub_id"] = lambda x: df_value_as_link(dimensions_url(x, "publications"), x) if "researcher_id" in cols: format_rules["researcher_id"] = lambda x: df_value_as_link(dimensions_url(x, "researchers"), x) if "grant_id" in cols: format_rules["grant_id"] = lambda x: df_value_as_link(dimensions_url(x, "grants"), x) if "aff_id" in cols: format_rules["aff_id"] = lambda x: df_value_as_link(dimensions_url(x, "organizations"), x) if "current_organization_id" in cols: format_rules["current_organization_id"] = lambda x: df_value_as_link(dimensions_url(x, "organizations"), x) for col in ["orcid_id", 'Orcid IDs']: if col.lower() in cols: url_root = "https://orcid.org/" format_rules[col] = lambda x: df_value_as_link(x, x, url_root) for col in ["GRID IDs", "Funders GRID IDs", "Assignees GRID IDs"]: if col.lower() in cols: format_rules[col] = lambda x: df_format_gridids(x) df = df.style.format(format_rules) if cols_to_drop: df = df.hide_columns(cols_to_drop) return df