Source code for dimcli.core.api

"""
Dimcli objects for querying the Dimensions API.
NOTE: these objects are attached to the top-level ``dimcli`` module, so they can be loaded as follows:

>>> import dimcli
>>> dsl = dimcli.Dsl()

"""


import requests
import time
import json
import IPython.display
from itertools import islice
import urllib.parse

import pandas as pd

from .auth import get_global_connection 
from .dsl_grammar import G
from .dataframe_factory import DfFactory

from ..utils.all import *



class Dsl():
    """The Dsl object is the main interface for interacting with the Dimensions API.

    Parameters
    ----------
    show_results : bool, default=False
        Global setting that determines whether query JSON results get printed out.
        Note that in Jupyter environments this is not needed, because iPython rich widgets are used by default.
    auth_session : APISession, default=False
        An authenticated session object that should be used for querying. Used only in special situations,
        as an alternative to the dimcli.login() utility method.
    verbose : bool, default=True
        Verbose mode.

    Example
    -------
    >>> import dimcli
    >>> dimcli.login()
    >>> dsl = dimcli.Dsl()
    >>> dsl.query(\"\"\"search grants for "graphene" return researchers\"\"\")
    <dimcli.dimensions.DslDataset object>
    >>> _.json
    >>> {'researchers': [{'id': 'ur.01332073522.49', 'count': 75, 'last_name': 'White', 'first_name': 'Nicholas J'},
    "... JSON data continues ... "

    In some special situations, you may want to query two separate Dimensions servers in parallel.
    To that end, it is possible to pass an `APISession` instance to the `Dsl()` constructor
    using the `auth_session` parameter, e.g.:

    >>> import dimcli
    >>> from dimcli.core.auth import APISession
    >>> # set up the first authentication backend
    >>> mysession1 = APISession()
    >>> mysession1.login(instance="app.dimensions.ai")
    >>> d1 = Dsl(auth_session=mysession1)
    >>> d1.query("search publications return research_orgs")
    >>> # set up the second authentication backend
    >>> mysession2 = APISession()
    >>> mysession2.login(instance="another-app.dimensions.ai")
    >>> d2 = Dsl(auth_session=mysession2)
    >>> d2.query("search publications return research_orgs")

    """

    def __init__(self, show_results=False, verbose=True, auth_session=False):
        """Initialises a Dsl object."""
        self._show_results = show_results
        self._verbose = verbose
        self._url = None
        self._headers = None
        self.verify_ssl = True
        if auth_session:
            self._CONNECTION = auth_session
        else:
            self._CONNECTION = get_global_connection()
        if self._CONNECTION.token:
            # if already logged in, reuse the existing connection
            self._url = self._CONNECTION.url
            self._headers = {'Authorization': "JWT " + self._CONNECTION.token}
            self.verify_ssl = self._CONNECTION.verify_ssl
        else:
            self._print_please_login()

    @property
    def is_logged_in(self):
        if self._url and self._headers:
            return True
        return False

    def _print_please_login(self):
        printDebug("Warning: you are not logged in. Please use `dimcli.login(key, endpoint)` before querying.")

    def _refresh_login(self):
        if self._CONNECTION:
            self._CONNECTION.refresh_login()
            self._url = self._CONNECTION.url
            self._headers = {'Authorization': "JWT " + self._CONNECTION.token}
            self.verify_ssl = self._CONNECTION.verify_ssl
        else:
            printDebug("Warning: please login first.")
    def query(self, q, show_results=None, retry=0, verbose=None):
        """Execute a single DSL query.

        This method handles the query token from the API and regenerates it if it has expired.
        If the API throws a 'Too Many Requests for the Server' error, the method sleeps for 30 seconds before retrying.

        Parameters
        ----------
        q : str
            The DSL query.
        show_results : bool, default=None
            Determines whether the query JSON results should be printed out. If None, it inherits from the Dsl global setting.
            Note that in Jupyter environments this is not needed, because iPython rich widgets are used by default.
        retry : int, default=0
            Number of times to retry the query if it fails.
        verbose : bool, default=None
            Verbose mode. If None, it inherits from the Dsl global setting.

        Returns
        -------
        DslDataset
            A Dimcli wrapper object containing JSON data.

        Example
        -------
        >>> dsl = dimcli.Dsl()
        >>> dsl.query("search grants where start_year=2020 return grants")
        <dimcli.dimensions.DslDataset object>
        """
        if not self.is_logged_in:
            self._print_please_login()
            return False

        if verbose is None:
            verbose = self._verbose

        # Execute DSL query.
        start = time.time()
        response = requests.post(self._url, data=q.encode(), headers=self._headers, verify=self.verify_ssl)
        if response.status_code == 429:
            # Too Many Requests
            printDebug(
                'Too Many Requests for the Server. Sleeping for 30 seconds and then retrying.'
            )
            time.sleep(30)
            return self.query(q, show_results, retry, verbose)
        elif response.status_code == 403:
            # Forbidden: the login token has expired
            printDebug('Login token expired. Logging in again.')
            self._refresh_login()
            return self.query(q, show_results, retry, verbose)
        elif response.status_code in [200, 400, 500]:
            ###
            # OK or Error Info :-)
            ###
            try:
                res_json = response.json()
            except:
                printDebug('Unexpected error. JSON could not be parsed.')
                return response
            result = DslDataset(res_json)
            end = time.time()
            elapsed = end - start
            if verbose:
                print_json_stats(result, q, elapsed)
            print_json_errors(result)  # ALWAYS print errors
            if verbose:
                print_json_warnings(result)  # DON'T print warnings unless verbose=True
            if show_results or (show_results is None and self._show_results):
                IPython.display.display(result)
            return result
        else:
            if retry > 0:
                printDebug('Retrying in 30 secs')
                time.sleep(30)
                return self.query(q, show_results, retry - 1, verbose)
            else:
                if verbose:
                    printDebug("ERROR LOG\n---\nQuery\n---\n" + str(q), "red")
                    printDebug("Response.header\n---\n" + str(response.headers), "red")
                    printDebug("Response.content\n---\n" + str(response.content), "red")
                response.raise_for_status()
    def query_iterative(self, q, show_results=None, limit=1000, skip=0, pause=1.5, force=False, maxlimit=0, verbose=None, _tot_count_prev_query=0, _warnings_tot=None):
        """Runs a DSL query and then keeps querying until all matching records have been extracted.

        The API returns a maximum of 1000 records per call. If a DSL query results in more than 1000 matches,
        it is possible to use pagination to get more results, up to 50k.

        Iterative querying works by automatically paginating through all records available for a result set.
        The original query gets turned into a loop that uses the `limit` / `skip` operators until all the
        results available have been extracted.

        NOTE: if any of the iterative queries produce warning messages, these are aggregated and added to the
        `_warnings` section of the output data.

        Parameters
        ----------
        q : str
            The DSL query. Important: pagination keywords eg `limit` / `skip` should be omitted.
        show_results : bool, default=None
            Determines whether the final results are rendered via the iPython display widget (for Jupyter notebooks).
        limit : int, default=1000
            How many records to extract per iteration. Defaults to 1000.
        skip : int, default=0
            Offset for the first iteration. Defaults to 0. After the first iteration, this value is calculated dynamically.
        pause : float, default=1.5
            How much time to pause after each iteration, expressed in seconds. Defaults to 1.5.
            Note: each iteration gets timed, so the pause is applied only when the query time is less than 2 seconds.
        force : bool, default=False
            Continue the extraction even if one of the iterations fails due to an error.
        maxlimit : int, default=0
            The maximum number of records to extract in total. If 0, all available records are extracted,
            up to the API upper limit of 50k records per query.
        verbose : bool, default=None
            Verbose mode.

        Returns
        -------
        DslDataset
            A Dimcli wrapper object containing JSON data.

        Example
        -------
        >>> dsl = dimcli.Dsl()
        >>> dsl.query_iterative(\"\"\"search grants where category_for.name="0206 Quantum Physics" return grants\"\"\")
        Starting iteration with limit=1000 skip=0 ...
        0-1000 / 8163 (4.062144994735718s)
        1000-2000 / 8163 (1.5146172046661377s)
        2000-3000 / 8163 (1.7225260734558105s)
        3000-4000 / 8163 (1.575329065322876s)
        4000-5000 / 8163 (1.521540880203247s)
        5000-6000 / 8163 (1.471721887588501s)
        6000-7000 / 8163 (1.5068159103393555s)
        7000-8000 / 8163 (1.4724757671356201s)
        8000-8163 / 8163 (0.7611980438232422s)
        ===
        Records extracted: 8163

        """
        if not self.is_logged_in:
            self._print_please_login()
            return False

        if verbose is None:
            verbose = self._verbose

        if line_count_returns(q) != 1:
            raise Exception("Iterative queries support only 1 return statement")
        if line_has_limit_or_skip(q):
            raise Exception("Iterative queries should not contain the keywords `limit` or `skip`")
        sourcetype = line_search_return(q)
        if not (sourcetype in G.sources()):
            raise Exception("Iterative queries can return only one of the Dimensions sources: %s" % ", ".join([s for s in G.sources()]))
        IS_UNNEST = line_search_unnest(q)

        #
        # ensure we stop the loop at 50k **
        #
        MAXLIMIT = maxlimit or 50000

        flag_last_round = False
        if skip + limit >= MAXLIMIT:
            flag_last_round = True
            if skip + limit > MAXLIMIT:
                limit = MAXLIMIT - skip

        if not _tot_count_prev_query:
            # first iteration
            if verbose:
                printDebug(f"Starting iteration with limit={limit} skip={skip} ...")

        output, flag_force = [], False
        q2 = q + " limit %d skip %d" % (limit, skip)

        start = time.time()
        res = self.query(q2, show_results=False, retry=0, verbose=False)
        end = time.time()
        elapsed = end - start
        if (end - start) < 2:
            time.sleep(pause)

        if res['errors'] and not force:
            printDebug(f"\n>>>[Dimcli tip] An error occurred with the batch '{skip}-{limit+skip}'. Consider using the 'limit' argument to retrieve fewer records per iteration, or use 'force=True' to ignore errors and continue the extraction.")
            return res
        elif res['errors'] and force:
            printDebug(f"\n>>>[Dimcli log] An error occurred with the batch '{skip}-{limit+skip}'. Skipping this batch and continuing iteration.. ")
            flag_force = True

        # RECURSION

        try:
            tot = int(res['stats']['total_count'])
        except:
            tot = _tot_count_prev_query  # when force=True, we have no current query stats

        new_skip = skip + limit
        if tot > 0 and new_skip > tot:
            new_skip = tot

        if verbose and tot:
            # if not the first iteration
            t = "%.2f" % elapsed
            printDebug(f"{skip}-{new_skip} / {tot} ({t}s)")

        if res["_warnings"]:
            warnings = [f"""{x} (iteration: {skip}-{new_skip})""" for x in res["_warnings"]]
            if _warnings_tot:
                _warnings_tot += warnings
            else:
                _warnings_tot = warnings

        if flag_force:
            output = self.query_iterative(q, show_results, limit, new_skip, pause, force, maxlimit, verbose, _tot_count_prev_query, _warnings_tot)
        elif not IS_UNNEST and len(res[sourcetype]) == limit and not flag_last_round:
            output = res[sourcetype] + self.query_iterative(q, show_results, limit, new_skip, pause, force, maxlimit, verbose, tot, _warnings_tot)
        elif IS_UNNEST and len(res[sourcetype]) > 0 and not flag_last_round:
            # unnest returns a number of records that doesn't relate to the actual data left,
            # hence we can't match the length of the results against the limit in this case
            output = res[sourcetype] + self.query_iterative(q, show_results, limit, new_skip, pause, force, maxlimit, verbose, tot, _warnings_tot)
        else:
            output = res[sourcetype]

        # FINALLY
        #
        # if recursion is complete (we are at the top level, hence skip=0)
        #   build the DslDataset obj
        # else
        #   just return the current iteration results
        #
        if skip == 0:
            response_simulation = {
                "_stats": {
                    "total_count": tot or len(output)  # fallback..
                },
                sourcetype: output
            }
            if _warnings_tot:
                response_simulation["_warnings"] = _warnings_tot
            result = DslDataset(response_simulation)
            if show_results or (show_results is None and self._show_results):
                IPython.display.display(result)
            if verbose:
                printDebug(f"===\nRecords extracted: {len(output)}")
            if _warnings_tot:
                printDebug(f"Warnings: {len(_warnings_tot)}")
            return result
        else:
            return output
    def __repr__(self):
        return f"<dimcli.Dsl #{id(self)}. API endpoint: {self._url}>"
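# Usage sketch (illustrative only, not part of the library API). The commented lines
# below show how the Dsl object defined above is typically combined with query() and
# query_iterative(); the query string and CSV filename are hypothetical examples.
#
# import dimcli
# dimcli.login()                       # assumes API credentials are already configured
# dsl = dimcli.Dsl()
# data = dsl.query_iterative("search grants where start_year=2020 return grants")
# data.as_dataframe().to_csv("grants_2020.csv", index=False)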
class DslDataset(IPython.display.JSON):
    """Wrapper for JSON results from a DSL query. This object makes it easier to process, save and load API JSON data.

    Example
    -------
    >>> dsl = dimcli.Dsl()
    >>> data = dsl.query(\"\"\"search publications for "machine learning" return publications limit 100\"\"\")
    Returned Publications: 100 (total = 2501114)
    Time: 1.36s
    >>> print(data)
    <dimcli.DslDataset object #4383191536. Records: 100/2501114>
    >>> len(data)
    100
    >>> data.count_batch
    100
    >>> data.count_total
    2501114
    >>> data.json  # => returns the underlying JSON data
    >>> data['publications']  # => shortcut for the 'publications' key in the underlying JSON data
    >>> data.publications  # => ..this is valid too!

    """
    @classmethod
    def from_publications_list(cls, data):
        """Utility method that allows to simulate an API results DslDataset object from raw publications data.

        This functionality can be used to reload data that was cached locally, or to combine the merged results
        of separate API queries into a single DslDataset object.

        Once created, the DslDataset object behaves exactly as if it had been obtained from an API query
        (so one can take advantage of the dataframe creation methods, for example).

        Parameters
        ----------
        data : list or pandas dataframe
            A list of publications, in the form of either a list of dictionaries, or a pandas dataframe.

        Returns
        -------
        DslDataset
            A Dimcli wrapper object containing JSON data.

        Example
        -------
        >>> dsl = dimcli.Dsl()
        >>> rawdata = dsl.query("search publications return publications").publications
        >>> type(rawdata)
        list
        >>> newDataset = dimcli.DslDataset.from_publications_list(rawdata)
        >>> newDataset
        <dimcli.DslDataset object #4767014816. Records: 20/20>
        """
        return cls._from_any_list(data, "publications")
    @classmethod
    def from_grants_list(cls, data):
        """Utility method that allows to simulate an API results DslDataset object from raw grants data.
        See the `from_publications_list` method for more information.

        Parameters
        ----------
        data : list or pandas dataframe
            A grants list (using the API DSL structure), in the form of either a list of dictionaries, or a pandas dataframe.

        Returns
        -------
        DslDataset
            A Dimcli wrapper object containing JSON data.
        """
        return cls._from_any_list(data, "grants")
    @classmethod
    def from_researchers_list(cls, data):
        """Utility method that allows to simulate an API results DslDataset object from raw researchers data.
        See the `from_publications_list` method for more information.

        Parameters
        ----------
        data : list or pandas dataframe
            A researchers list (using the API DSL structure), in the form of either a list of dictionaries, or a pandas dataframe.

        Returns
        -------
        DslDataset
            A Dimcli wrapper object containing JSON data.
        """
        return cls._from_any_list(data, "researchers")
    @classmethod
    def from_clinical_trials_list(cls, data):
        """Utility method that allows to simulate an API results DslDataset object from raw clinical_trials data.
        See the `from_publications_list` method for more information.

        Parameters
        ----------
        data : list or pandas dataframe
            A clinical_trials list (using the API DSL structure), in the form of either a list of dictionaries, or a pandas dataframe.

        Returns
        -------
        DslDataset
            A Dimcli wrapper object containing JSON data.
        """
        return cls._from_any_list(data, "clinical_trials")
    @classmethod
    def from_patents_list(cls, data):
        """Utility method that allows to simulate an API results DslDataset object from raw patents data.
        See the `from_publications_list` method for more information.

        Parameters
        ----------
        data : list or pandas dataframe
            A patents list (using the API DSL structure), in the form of either a list of dictionaries, or a pandas dataframe.

        Returns
        -------
        DslDataset
            A Dimcli wrapper object containing JSON data.
        """
        return cls._from_any_list(data, "patents")
    @classmethod
    def from_policy_documents_list(cls, data):
        """Utility method that allows to simulate an API results DslDataset object from raw policy_documents data.
        See the `from_publications_list` method for more information.

        Parameters
        ----------
        data : list or pandas dataframe
            A policy_documents list (using the API DSL structure), in the form of either a list of dictionaries, or a pandas dataframe.

        Returns
        -------
        DslDataset
            A Dimcli wrapper object containing JSON data.
        """
        return cls._from_any_list(data, "policy_documents")
    @classmethod
    def from_organizations_list(cls, data):
        """Utility method that allows to simulate an API results DslDataset object from raw organizations data.
        See the `from_publications_list` method for more information.

        Parameters
        ----------
        data : list or pandas dataframe
            An organizations list (using the API DSL structure), in the form of either a list of dictionaries, or a pandas dataframe.

        Returns
        -------
        DslDataset
            A Dimcli wrapper object containing JSON data.
        """
        return cls._from_any_list(data, "organizations")
    @classmethod
    def _from_any_list(cls, data, source_type):
        """Generic method that allows to simulate an API results DslDataset object from raw data."""
        if type(data) == list:
            return cls({source_type: data, '_stats': {'total_count': len(data)}})
        elif type(data) == pd.DataFrame:
            jsondata = json.loads(data.to_json(orient="records"))
            return cls({source_type: jsondata, '_stats': {'total_count': len(jsondata)}})
        else:
            raise ValueError('Invalid data format. Must be either a dict list, or a pandas dataframe')
    @classmethod
    def load_json_file(cls, filename, verbose=False):
        """Load a file containing DSL JSON data and return a valid DslDataset object.

        Note: this is normally used in combination with the `to_json_file` method.

        Parameters
        ----------
        filename : str
            A valid filename (including path if necessary) that contains the JSON data.

        Returns
        -------
        DslDataset
            A Dimcli wrapper object containing JSON data.

        Example
        -------
        Save the results of a query to a JSON file, then reload the same file and create a new dataset.

        >>> dataset = dsl.query(\"\"\"search publications where journal.title="nature medicine" return publications[id+title+year+concepts] limit 100\"\"\")
        Returned Publications: 100 (total = 12641)

        Save the data to a local json file

        >>> FILENAME = "test-api-save.json"
        >>> dataset.to_json_file(FILENAME, verbose=True)
        Saved to file: test-api-save.json

        Create a new DslDataset object by loading the contents of the JSON file.

        >>> new_dataset = DslDataset.load_json_file(FILENAME, verbose=True)
        Loaded file: test-api-save.json
        >>> print(new_dataset)
        <dimcli.DslDataset object #4370267824. Records: 100/12641>

        """
        with open(filename) as json_file:
            jsondata = json.load(json_file)
        if verbose:
            printDebug("Loaded file: ", filename)
        return cls(jsondata)
    def __init__(self, data):
        IPython.display.JSON.__init__(self, data)
        self.json = self.data
        self.errors = None
        for k in self.json.keys():
            # add result dict keys as attributes dynamically
            if k == "_stats":
                setattr(self, "stats", self.json[k])
            else:
                setattr(self, k, self.json[k])
        self.df_factory = DfFactory(good_data_keys=self.good_data_keys())

    def __getitem__(self, key):
        "Trick to return any dict key as a property"
        if key == "stats":
            key = "_stats"  # syntactic sugar
        if key in self.json:
            return self.json[key]
        else:
            return []  # empty list so as to support iteration tests / previously: False

    def __len__(self):
        "Return the length of the first data object in the JSON"
        k = self.good_data_keys()
        try:
            return len(self.json[k[0]])
        except:
            return 0
    def good_data_keys(self):
        """Utility that returns the 'data' keys of the inner JSON object, excluding metadata like 'stats', 'warnings' and 'version' info.

        Returns
        -------
        list
            A list of dictionary keys.

        Example
        -------
        >>> queryresults.good_data_keys()
        ['publications']
        """
        skips = ["_warnings", "_notes", "_stats", "_version", "_copyright"]
        return [x for x in self.json.keys() if x not in skips]
    def keys_and_count(self):
        """Utility that previews the contents of the inner JSON object.

        Returns
        -------
        list
            A list of tuples.

        Example
        -------
        >>> queryresults.keys_and_count()
        [('_stats', 3), ('_warnings', 1), ('_version', 2), ('publications', 100)]
        """
        return [(x, len(self.json[x])) for x in self.json.keys()]
    @property
    def count_total(self):
        """Total number of results in Dimensions for the query (as opposed to the number of results returned in the JSON payload).

        Returns
        -------
        int
            The number of results
        """
        if self.json.get("_stats"):
            return self.json['_stats']['total_count']
        else:
            return None

    @property
    def count_batch(self):
        """Number of results returned by the query.

        Returns
        -------
        int
            The number of results
        """
        return len(self)

    @property
    def errors_string(self):  # can't be called 'errors' due to conflict with the auto-set attribute
        """Utility that merges all error messages into a single string."""
        if self.json.get("errors"):
            return self.json['errors']['query']['header'] + self.json['errors']['query']['details'][0]
        else:
            return ""
    def chunks(self, size=400, key=""):
        """Return an iterator for going through chunks of the JSON results.

        Note: in DSL queries with multiple `return` statements it is better to specify which result-type
        needs to be chunked, using the `key` parameter.

        Parameters
        ----------
        size : int, default=400
            Number of objects (records) to include in each chunk.
        key : str, optional
            The JSON results data object that needs to be chunked, eg 'publications' or 'grants'.
            If not specified, the first available dict key is used.

        Returns
        -------
        iterator
            An iterator object

        Example
        -------
        Break up a 1000-record dataset into groups of 100.

        >>> data = dslquery("search publications return publications limit 1000")
        >>> groups = [len(x) for x in data.chunks(size=100)]
        """
        if not key:
            if len(self.good_data_keys()) > 1:
                printDebug(f"Please specify a key from {self.good_data_keys()}")
                return
            else:
                key = self.good_data_keys()[0]
        elif key not in self.good_data_keys():
            printDebug(f"Invalid key: should be one of {self.good_data_keys()}")
            return

        it = iter(self.json[key])
        chunk = list(islice(it, size))
        while chunk:
            yield chunk
            chunk = list(islice(it, size))
    # Dataframe Methods
    def as_dataframe(self, key="", links=False, nice=False):
        """Return the JSON data as a Pandas DataFrame.

        If `key` is empty, the first available JSON key (eg 'publications') is used to determine
        which JSON data should be turned into a dataframe (mostly relevant when using multi-result DSL queries).

        Parameters
        ----------
        key : str, optional
            The JSON results data object that needs to be processed.
        links : bool, optional
            Transform suitable fields into hyperlinks. Default: False.
        nice : bool, optional
            Reformat column names and complex values where possible. Useful for visual inspection and printing out. Default: False.

        Returns
        -------
        pandas.DataFrame
            A DataFrame instance containing API records.

        Example
        -------
        See https://api-lab.dimensions.ai/cookbooks/1-getting-started/3-Working-with-dataframes.html
        """
        if not self.json.get("errors"):
            return self.df_factory.df_simple(self.json, key, links, nice)
    def as_dataframe_authors(self, links=False):
        """Return the JSON data as a Pandas DataFrame, in which each row corresponds to a publication author.

        This method works only with 'publications' queries and it's clever enough to know whether the `authors`
        or `author_affiliations` (deprecated) field is used. The list of affiliations for each author is not
        broken down and is returned as JSON, so in essence you get one row per author.

        Returns
        -------
        pandas.DataFrame
            A DataFrame instance containing API records.

        Example
        -------
        See https://api-lab.dimensions.ai/cookbooks/1-getting-started/3-Working-with-dataframes.html
        """
        if not self.json.get("errors"):
            return self.df_factory.df_authors(self.json, links)
    def as_dataframe_authors_affiliations(self, links=False):
        """Return the JSON data as a Pandas DataFrame, in which each row corresponds to a publication affiliation.

        This method works only with 'publications' queries and it's clever enough to know whether the `authors`
        or `author_affiliations` (deprecated) field is used. If an author has multiple affiliations, these are
        represented on different rows (hence the same author may appear on multiple rows).

        Returns
        -------
        pandas.DataFrame
            A DataFrame instance containing API records.

        Example
        -------
        See https://api-lab.dimensions.ai/cookbooks/1-getting-started/3-Working-with-dataframes.html
        """
        if not self.json.get("errors"):
            return self.df_factory.df_authors_affiliations(self.json, links)
    def as_dataframe_concepts(self, key="", links=False):
        """Return the JSON data as a Pandas DataFrame, in which each row corresponds to a single 'concept'.

        This method works only with 'publications' and 'grants' queries and it's clever enough to know whether
        the `concepts` or `concepts_scores` field is used. Additional metrics like 'frequency' and 'score_average'
        are also included in the results.

        Returns
        -------
        pandas.DataFrame
            A DataFrame instance containing API records.

        Example
        -------
        See https://api-lab.dimensions.ai/cookbooks/1-getting-started/3-Working-with-dataframes.html
        """
        if not self.json.get("errors"):
            return self.df_factory.df_concepts(self.json, key, links)
    def as_dataframe_funders(self, links=False):
        """Return the JSON data as a Pandas DataFrame, in which each row corresponds to a single 'funder'.
        This method works only with 'grants' queries.

        Returns
        -------
        pandas.DataFrame
            A DataFrame instance containing API records.

        Example
        -------
        See https://api-lab.dimensions.ai/cookbooks/1-getting-started/3-Working-with-dataframes.html
        """
        if not self.json.get("errors"):
            return self.df_factory.df_grant_funders(self.json, links)
    def as_dataframe_investigators(self, links=False):
        """Return the JSON data as a Pandas DataFrame, in which each row corresponds to a single 'investigator'.
        This method works only with 'grants' queries.

        Returns
        -------
        pandas.DataFrame
            A DataFrame instance containing API records.

        Example
        -------
        See https://api-lab.dimensions.ai/cookbooks/1-getting-started/3-Working-with-dataframes.html
        """
        if not self.json.get("errors"):
            return self.df_factory.df_grant_investigators(self.json, links)
    def as_dimensions_url(self, records=500, verbose=True):
        """Utility that turns a list of records into a Dimensions webapp URL, by using the record IDs as filters.

        NOTE: this functionality is EXPERIMENTAL and may break or be removed in future versions.
        Also, it works only with: publications, grants, patents, clinical_trials, policy_documents.

        Parameters
        ----------
        records : int, default=500
            The number of record IDs to use. With more than 500, it is likely to incur a '414 Request-URI Too Large' error.
        verbose : bool, default=True
            Verbose mode

        Returns
        -------
        str
            A string representing a Dimensions URL.

        Example
        -------
        >>> data = dsl.query(\"\"\"search publications where id in ["pub.1120715293", "pub.1120975084", "pub1122068834", "pub.1120602308"] return publications\"\"\")
        >>> data.as_dimensions_url()
        'https://app.dimensions.ai/discover/publication?search_text=id%3A+%28pub.1120975084+OR+pub.1120715293+OR+pub.1120602308%29'
        """
        if verbose:
            printDebug("Warning: this is an experimental and unsupported feature.")

        # General query structure for IDs:
        # `id: (pub.1120715293 OR pub.1120975084 OR pub.1122068834 OR pub.1120602308)`
        # The final URL looks like this:
        # https://app.dimensions.ai/discover/publication?search_text=id%3A+%28pub.1120715293+OR+pub.1120975084+OR+pub.1122068834+OR+pub.1120602308%29

        # hardcoded
        supported_url_templates = {
            'publications': "https://app.dimensions.ai/discover/publication?search_text=",
            'grants': "https://app.dimensions.ai/discover/grant?search_text=",
            'patents': "https://app.dimensions.ai/discover/patent?search_text=",
            'clinical_trials': "https://app.dimensions.ai/discover/clinical_trial?search_text=",
            'policy_documents': "https://app.dimensions.ai/discover/policy_document?search_text=",
        }

        # just return the first valid source found in the results
        ids = []
        for sourcetype in supported_url_templates:
            if sourcetype in self.good_data_keys():
                try:
                    ids = [x['id'] for x in self.json[sourcetype]]
                    q = " OR ".join(ids)
                    if sourcetype == "grants":
                        q = "grant_id: (" + q + ")"
                    else:
                        q = "id: (" + q + ")"
                    q = urllib.parse.quote_plus(q)
                    return supported_url_templates[sourcetype] + q
                except:
                    raise Exception("DslDataset records do not contain a valid ID field.")
        return None
    def to_json_file(self, filename="", verbose=True):
        """Export API results data to a JSON file.

        Note: this is normally used in combination with the `load_json_file` method.

        Parameters
        ----------
        filename : str, optional
            A filename/path where to save the data. If not provided, a unique name is generated automatically.

        Returns
        -------
        str
            The filename the data has been saved to.

        Example
        -------
        Save the results of a query to a JSON file, then reload the same file and create a new dataset.

        >>> dataset = dsl.query(\"\"\"search publications where journal.title="nature medicine" return publications[id+title+year+concepts] limit 100\"\"\")
        Returned Publications: 100 (total = 12641)

        Save the data to a local json file

        >>> FILENAME = "test-api-save.json"
        >>> dataset.to_json_file(FILENAME, verbose=True)
        Saved to file: test-api-save.json

        Data can be reloaded from file, using the `load_json_file` class method.

        >>> new_dataset = DslDataset.load_json_file(FILENAME, verbose=True)
        Loaded file: test-api-save.json
        >>> print(new_dataset)
        <dimcli.DslDataset object #4370267824. Records: 100/12641>

        """
        if not self.json.get("errors"):
            if not filename:
                filename = time.strftime("dimensions_data_%Y-%m-%d_%H-%M-%S.json")
            with open(filename, 'w') as outfile:
                json.dump(self.json, outfile)
            if verbose:
                printDebug("Saved to file: ", filename)
            return filename
    def to_gsheets(self, title=None, verbose=True):
        """Export the dataframe version of some API results to a public google sheet. Google OAUTH client credentials are a prerequisite for this method to work correctly.

        Parameters
        ----------
        title : str, optional
            The spreadsheet title, if one wants to reuse an existing spreadsheet.
        verbose : bool, default=True
            Verbose mode

        Notes
        -----
        This method assumes that the calling environment can provide valid Google authentication credentials.
        There are two routes to make this work, depending on whether one is using Google Colab or a traditional Jupyter environment.

        **Google Colab**
        This is the easiest route. In Google Colab, all required libraries are already available. The `to_gsheets`
        method simply triggers the built-in authentication process via a pop-up window.

        **Jupyter**
        This route involves a few more steps. In Jupyter, it is necessary to install the gspread, oauth2client
        and gspread_dataframe modules first. Secondly, one needs to create Google Drive access credentials using
        OAUTH (which boils down to a JSON file). Note that the credentials file needs to be saved in:
        `~/.config/gspread/credentials.json` (for gspread). The steps are described at
        https://gspread.readthedocs.io/en/latest/oauth2.html#for-end-users-using-oauth-client-id.

        Returns
        -------
        str
            The google sheet URL as a string.
        """
        if self.json.get("errors"):
            return None
        df = self.as_dataframe()
        return export_as_gsheets(df, title=title, verbose=verbose)
    def __repr__(self):
        if self.json.get("errors"):
            return "<dimcli.DslDataset object #%s. Errors: %d>" % (str(id(self)), len(self.json['errors']))
        else:
            try:
                return "<dimcli.DslDataset object #%s. Records: %d/%d>" % (str(id(self)), self.count_batch, self.count_total)
            except:
                # non-search queries
                return "<dimcli.DslDataset object #%s. Dict keys: %s>" % (str(id(self)), ", ".join([f"'{x}'" for x in self.json]))
# 2019-12-17: for backward compatibility
# remove once all notebooks code has been updated
Result = DslDataset
Dataset = DslDataset
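# Usage sketch (illustrative only, not part of the library API). DslDataset objects
# can be cached to disk and rebuilt later, or recreated from raw record lists; the
# filename below is a hypothetical example.
#
# dataset = dsl.query("search publications return publications limit 100")
# saved = dataset.to_json_file("publications_cache.json")
# reloaded = DslDataset.load_json_file(saved)
# rebuilt = DslDataset.from_publications_list(reloaded.publications)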