# Source code for dimcli.utils.converters

from .misc_utils import printDebug
from .dim_utils import dimensions_url
from collections import OrderedDict


# ===========
# TESTING - UNSUPPORTED FEATURE
# ===========
#

class DslDataConverter():
	"""
	Helper class containing methods for transforming complex JSON snippets to other formats.
	Useful eg for creating a nice looking CSV from raw API data.

	Status: ALPHA - UNSUPPORTED FEATURE

	Converter subclasses available only for
	* Pubs
	* Grants
	* Clinical Trials
	* Datasets
	* Patents

	To Review:
	* Organizations
	* Policy Documents
	* Researchers

	Subclasses fill in `column_transformations`, an ordered mapping of
	``new column name -> (source column, converter method name[, extra argument])``.
	An empty converter name means the source column is copied unchanged.

	Example
	========
	>>> from dimcli.utils.converters import *
	>>> df_temp = dsl.query_iterative("search publications return publications").as_dataframe()
	>>> c1 = DslPubsConverter(df_temp)
	>>> df_final = c1.run()

	ALGORITHM
	==========
	# (optional) add a default 'Title Cased' rule for every column without an explicit rule
	#
	# for each rule in column_transformations:
	#   if the source column exists in the dataframe:
	#     replace missing values with "" and apply the converter => new column
	#     (no converter => plain copy of the source column)
	#
	# finally keep only the declared columns, in declaration order
	#
	#  PS several rules can read the same source column, eg 'Dimensions URL'
	#   is generated from 'id' next to the plain ID column

	"""

	# Cells longer than this are cut by truncate_for_gsheets(). It sits below the
	# 50000 characters limit quoted in the gsheets error so the "..." suffix still fits.
	_GSHEETS_CELL_LIMIT = 49500

	def __init__(self, df, object_type="", verbose=False):
		"""Store the source dataframe and set up an empty set of rules.

		df: dataframe to convert; it is never modified, a copy is transformed instead.
		object_type: str, label of the kind of records in `df` (only stored on the instance).
		verbose: bool, when True progress messages are emitted via printDebug.
		"""
		self.df_original = df
		self.df_modified = None  # populated by apply_transformations()

		self.object_type = object_type
		self.keep_extra_cols = False
		self.verbose = verbose

		self.columns_original = self.df_original.columns.to_list()
		if self.verbose: printDebug("Original columns are:", self.columns_original)

		# defined when subclassing
		self.column_transformations = OrderedDict()


	def run(self, keep_extra_cols=True):
		"""Apply all transformations, then reorder/prune columns, and return the new dataframe.

		keep_extra_cols: bool, when True columns without an explicit rule are kept too
		(see extend_transformations).

		@TODO define a suitable abstraction for automatic transformation
		eg simplify all fields to strings
		"""
		self.keep_extra_cols = keep_extra_cols
		self.apply_transformations()
		self.sort_and_prune()
		return self.df_modified


	def extend_transformations(self):
		"""Add a default rule for every original column that no existing rule reads from.

		The new column name is the original one with underscores turned into spaces
		and Title Casing applied; the values are copied as they are.
		Note that the rules are added to `column_transformations` permanently.
		"""
		# computed once: sources that already have a rule
		mapped_sources = {rule[0] for rule in self.column_transformations.values()}
		for c in self.columns_original:
			if c in mapped_sources:
				continue
			new_c = c.replace("_", " ").title()
			if new_c in self.column_transformations:
				# never overwrite an explicit rule that happens to share the
				# generated name: fall back to the raw column name instead
				new_c = c
			self.column_transformations[new_c] = (c, '')
			mapped_sources.add(c)



	#
	#
	# Helpers METHODS
	#
	#

	def apply_transformations(self):
		"""For each rule whose source column exists, build the new column on a copy of the original dataframe.

		If `self.keep_extra_cols` is True, columns not covered by a rule get a default one first.
		Missing values are replaced by empty strings, both before a converter is
		applied and in the final dataframe. The result is stored in `self.df_modified`.
		"""
		if self.verbose: printDebug("Applying transformations..")

		df = self.df_original.copy()

		if self.keep_extra_cols:
			self.extend_transformations()

		for new_col, details in self.column_transformations.items():
			source, action = details[0], details[1]
			arg = details[2] if len(details) > 2 else None
			if source not in self.columns_original:
				# rules about columns that were not returned are simply ignored
				continue
			if self.verbose: printDebug(f"...converting '{source}' to '{new_col}'")
			if not action:
				df[new_col] = df[source]
				continue
			function = getattr(self, action)
			cells = df[source].fillna("")
			if arg:
				df[new_col] = cells.apply(lambda cell: function(cell, arg))
			else:
				df[new_col] = cells.apply(function)

		df.fillna('', inplace=True)
		# finally:
		self.df_modified = df


	def sort_and_prune(self, new_cols_ordered_list=None):
		"""Keep only the wanted columns of `self.df_modified`, in the wanted order.

		new_cols_ordered_list: list of column names, optional.
		If it is not a non-empty list, the order is inferred from the declared
		rules, skipping the ones whose column was never generated.

		Returns the resulting dataframe (also stored in `self.df_modified`).
		"""

		if self.verbose: printDebug("Sorting / dropping columns...")

		if not (new_cols_ordered_list and isinstance(new_cols_ordered_list, list)):
			# infer from all declared columns
			# PS ensure declared cols actually exist!!
			existing_cols = set(self.df_modified.columns)
			new_cols_ordered_list = [c for c in self.column_transformations if c in existing_cols]

		if new_cols_ordered_list:
			self.df_modified = self.df_modified[new_cols_ordered_list]

		return self.df_modified



	def truncate_for_gsheets(self, cols_subset=None):
		"""
		helper to avoid gsheets error
		'Your input contains more than the maximum of 50000 characters in a single cell.'

		Values whose string form is longer than 49500 characters are replaced by
		their first 49500 characters followed by "..."; other values are left untouched.
		`self.df_modified` is updated in place, nothing is returned.

		cols_subset: eg ['Abstract', 'Authors', 'Authors Affiliations']
		Nothing happens if no columns are passed.

		"""

		if self.verbose: printDebug("Truncating strings longer than 50k chars...")

		limit = self._GSHEETS_CELL_LIMIT

		def helper(s):
			text = str(s)
			if len(text) > limit:
				return text[:limit] + "..."
			return s

		if cols_subset:
			for col in cols_subset:
				self.df_modified[col] = self.df_modified[col].apply(helper)

	#
	#
	# CONVERSION METHODS
	#
	#

	@staticmethod
	def _join_key(records, key):
		"""Join with '; ' the non-empty values found under `key` in a list of dicts."""
		values = (record.get(key) for record in records)
		return "; ".join([str(v) for v in values if v])

	@staticmethod
	def _author_name(author):
		"""Format an author dict as 'last_name, first_name' (missing parts are left empty)."""
		return (author.get('last_name') or "") + ", " + (author.get('first_name') or "")

	@staticmethod
	def _unique_affiliation_values(authorslist, key):
		"""Join with '; ' the distinct non-empty `key` values of all authors' affiliations, in first-seen order."""
		found = OrderedDict()
		for author in authorslist:
			for affiliation in author.get('affiliations') or []:
				value = affiliation.get(key)
				if value:
					found[value] = None
		return "; ".join(found)

	def convert_id_to_url(self, idd, ttype=None):
		"""Return what `dimensions_url` gives for the record ID `idd`.

		ttype: passed as second argument when truthy; otherwise only the ID is passed.
		"""
		if ttype:
			return dimensions_url(idd, ttype)
		return dimensions_url(idd)


	def convert_authors_to_names(self, authorslist):
		"""
		From: [{'first_name': 'Ann', 'last_name': 'Lee', ...}, {'first_name': 'Bo', 'last_name': 'Xu', ...}]
		To: "Lee, Ann; Xu, Bo"
		"""
		return "; ".join([self._author_name(x) for x in authorslist])


	def convert_authors_corresponding(self, authorslist):
		"""Same as convert_authors_to_names, but only for authors with a truthy 'corresponding' value."""
		return "; ".join([self._author_name(x) for x in authorslist if x.get("corresponding", "")])

	def convert_authors_affiliations(self, authorslist):
		"""
		To: "Lee, Ann (Univ A; Univ B); Xu, Bo (Univ A)"
		Authors without affiliations get empty brackets.
		"""
		author_affiliations = []
		for x in authorslist:
			affiliations = "; ".join([a.get('name') or "" for a in x.get('affiliations') or []])
			author_affiliations.append(f"{self._author_name(x)} ({affiliations})")
		return "; ".join(author_affiliations)

	def convert_authors_grids(self, authorslist):
		"""Return the distinct affiliation 'id' values of all authors, '; ' separated, in first-seen order."""
		return self._unique_affiliation_values(authorslist, 'id')

	def convert_authors_countries(self, authorslist):
		"""Return the distinct affiliation 'country' values of all authors, '; ' separated, in first-seen order."""
		return self._unique_affiliation_values(authorslist, 'country')

	def convert_investigators_cltrials(self, investigatorslist):
		"""
		From: 
		[['Chaoqian Li', '', 'Study leader', '6 Shuangyong Road, Nanning, Guangxi Zhuang Autonomous Region, China', '', ''], ['Jianlin Huang', '', 'Applicant', "Beihai People's Hospital", "Beihai People's Hospital", 'grid.452719.c']]
		To
		"Chaoqian Li; Jianlin Huang"

		Empty entries are skipped.
		"""
		return "; ".join([x[0] for x in investigatorslist if x])

	def convert_list(self, data):
		"""Join the string form of each element with '; '."""
		return "; ".join([str(x) for x in data])

	def convert_dict_name(self, data):
		"""Join the 'name' of each dict with '; ' (dicts without a name are skipped)."""
		return self._join_key(data, 'name')

	def convert_dict_ids(self, data):
		"""Join the 'id' of each dict with '; ' (dicts without an id are skipped)."""
		return self._join_key(data, 'id')

	def convert_city_name(self, data):
		"""Join the 'city_name' of each dict with '; ' (dicts without one are skipped)."""
		return self._join_key(data, 'city_name')

	def convert_state_name(self, data):
		"""Join the 'state_name' of each dict with '; ' (dicts without one are skipped)."""
		return self._join_key(data, 'state_name')

	def convert_country_name(self, data):
		"""Join the 'country_name' of each dict with '; ' (dicts without one are skipped)."""
		return self._join_key(data, 'country_name')

	def convert_interventions_dict(self, data):
		"""
		Return 'name' and 'type' for clinical trials / interventions 

		From: "[{'arm_group_labels': 'Hydroxychloroquine and conventional treatments', 'type': 'Drug', 'description': 'Subjects take hydroxychloroquine 400 mg per day for 5 days, also take conventional treatments', 'other_names': '', 'name': 'Hydroxychloroquine'}]"
		
		To: "Hydroxychloroquine (Drug)"
		"""
		return "; ".join([f"{x.get('name', '')} ({x.get('type', '')})" for x in data])

	def convert_float_to_integer(self, data):
		"""Return `int(data)`, or `data` unchanged when it cannot be converted (eg '' or NaN)."""
		try:
			return int(data)
		except (TypeError, ValueError, OverflowError):
			return data


	def convert_abstract_to_preview(self, abstract):
		"""Always return an empty string.

		The 20-words preview was switched off on May 22, 2020; the previous logic was
		`" ".join(abstract.split()[:20]) + "..."`
		"""
		return ""




class DslPubsConverter(DslDataConverter):
	"""Converter holding the column rules for `publications` dataframes.
	"""

	def __init__(self, df, verbose=False):
		"""
		df: dataframe of publications records.
		verbose: bool, passed on to the base converter.
		"""

		super().__init__(df, "publications", verbose)

		## OVERRIDE VALUES

		self.column_transformations = OrderedDict({
			# 'new_col_name' : ('source_col_name', 'converter_method_name' [, extra_argument])
			'Date added' : ('date_inserted', ''),
			'Publication ID' : ('id', ''),
			'DOI' : ('doi', ''),
			'PMID' : ('pmid', ''),
			'PMCID' : ('pmcid', ''),
			'Title' : ('title', ''),
			'Abstract' : ('abstract', ''),
			'Source title' : ('journal.title', ''),
			'Source ID' : ('journal.id', ''),
			'Publisher' : ('publisher', ''),
			'MeSH terms' : ('mesh_terms', 'convert_list'),
			'Publication Date' : ('date', ''),
			'PubYear' : ('year', ''),
			'Volume' : ('volume', ''),
			'Issue' : ('issue', ''),
			'Pagination' : ('pages', ''),
			'Open Access' : ('open_access_categories', 'convert_dict_name'),
			'Publication Type' : ('type', ''),
			'Authors' : ('authors', 'convert_authors_to_names'),
			# 'Corresponding Authors' : ('authors', 'convert_authors_corresponding'),
			'Authors Affiliations' : ('authors', 'convert_authors_affiliations'),
			'GRID IDs' : ('authors', 'convert_authors_grids'),
			'Countries' : ('authors', 'convert_authors_countries'),
			'Research Organizations - standardized' : ('research_orgs', 'convert_dict_name'),
			# 'GRID IDs' : ('research_orgs', 'convert_dict_ids'),  # alternate method on different field
			'City of Research organization' : ('research_orgs', 'convert_city_name'),
			# 'State of Research organization' : ('research_orgs', 'convert_state_name'),
			'Country of Research organization' : ('research_orgs', 'convert_country_name'),
			'Funder' : ('funders', 'convert_dict_name'),
			'UIDs of supporting grants' : ('supporting_grant_ids', 'convert_list'),
			# TODO Supporting Grants (proj number?)
			'Times cited' : ('times_cited', 'convert_float_to_integer'),
			'Altmetric' : ('altmetric', 'convert_float_to_integer'),
			'Source Linkout' : ('linkout', ''),
			'Concepts' : ('concepts', 'convert_list'),
			# NOTE(review): no source type is passed here, unlike the other
			# converters in this module - confirm dimensions_url copes with it
			'Dimensions URL' : ('id', 'convert_id_to_url'),
			'FOR (ANZSRC) Categories' : ('category_for', 'convert_dict_name'),
			'RCDC Categories' : ('category_rcdc', 'convert_dict_name'),
			'HRCS HC Categories' : ('category_hrcs_hc', 'convert_dict_name'),
			'HRCS RAC Categories' : ('category_hrcs_rac', 'convert_dict_name'),
			'ICRP Cancer Types' : ('category_icrp_ct', 'convert_dict_name'),
			'ICRP CSO Categories' : ('category_icrp_cso', 'convert_dict_name'),
			'BRA Categories' : ('category_bra', 'convert_dict_name'),
			'HRA Categories' : ('category_hra', 'convert_dict_name'),
			'SDG Categories' : ('category_sdg', 'convert_dict_name'),
		})







class DslGrantsConverter(DslDataConverter):
	"""Converter holding the column rules for `grants` dataframes.
	"""

	def __init__(self, df, verbose=False):
		"""
		df: dataframe of grants records.
		verbose: bool, passed on to the base converter.
		"""

		super().__init__(df, "grants", verbose)

		self.column_transformations = OrderedDict({
			# 'new_col_name' : ('source_col_name', 'converter_method_name' [, extra_argument])
			'Date added' : ('date_inserted', ''),
			'Grant ID' : ('id', ''),
			'Title' : ('title', ''),
			'Abstract' : ('abstract', ''),
			'Start date' : ('start_date', ''),
			'End date' : ('end_date', ''),
			'Funders' : ('funders', 'convert_dict_name'),
			'Funders GRID IDs' : ('funders', 'convert_dict_ids'),
			'Funders country' : ('funders', 'convert_country_name'),
			'Research organizations' : ('research_orgs', 'convert_dict_name'),
			'GRID IDs' : ('research_orgs', 'convert_dict_ids'),
			'Research organizations countries' : ('research_orgs', 'convert_country_name'),
			'Source linkout' : ('linkout', ''),
			'Dimensions URL' : ('id', 'convert_id_to_url', 'grants'),
			'Concepts' : ('concepts', 'convert_list'),
			'FOR (ANZSRC) Categories' : ('category_for', 'convert_dict_name'),
			'RCDC Categories' : ('category_rcdc', 'convert_dict_name'),
			'HRCS HC Categories' : ('category_hrcs_hc', 'convert_dict_name'),
			'HRCS RAC Categories' : ('category_hrcs_rac', 'convert_dict_name'),
			'ICRP Cancer Types' : ('category_icrp_ct', 'convert_dict_name'),
			'ICRP CSO Categories' : ('category_icrp_cso', 'convert_dict_name'),
			'BRA Categories' : ('category_bra', 'convert_dict_name'),
			'HRA Categories' : ('category_hra', 'convert_dict_name'),
			'SDG Categories' : ('category_sdg', 'convert_dict_name'),
		})




class DslPatentsConverter(DslDataConverter):
	"""Converter holding the column rules for `patents` dataframes.
	"""

	def __init__(self, df, verbose=False):
		"""
		df: dataframe of patents records.
		verbose: bool, passed on to the base converter.
		"""

		# object type aligned with the source name used for 'Dimensions URL' below
		# (it used to be a copy-pasted "grants")
		super().__init__(df, "patents", verbose)

		self.column_transformations = OrderedDict({
			# 'new_col_name' : ('source_col_name', 'converter_method_name' [, extra_argument])
			'Date added' : ('date_inserted', ''),
			'Patent ID' : ('id', ''),
			'Title' : ('title', ''),
			'Abstract' : ('abstract', ''),
			'Date' : ('date', ''),
			'Funders' : ('funders', 'convert_dict_name'),
			'Funders GRID IDs' : ('funders', 'convert_dict_ids'),
			'Funders countries' : ('funders', 'convert_country_name'),
			'Assignees' : ('assignees', 'convert_dict_name'),
			'Assignees GRID IDs' : ('assignees', 'convert_dict_ids'),
			'Assignees countries' : ('assignees', 'convert_country_name'),
			'Source linkout' : ('linkout', ''),
			'Dimensions URL' : ('id', 'convert_id_to_url', 'patents'),
			'Concepts' : ('concepts', 'convert_list'),
			'FOR (ANZSRC) Categories' : ('category_for', 'convert_dict_name'),
			'RCDC Categories' : ('category_rcdc', 'convert_dict_name'),
			'HRCS HC Categories' : ('category_hrcs_hc', 'convert_dict_name'),
			'HRCS RAC Categories' : ('category_hrcs_rac', 'convert_dict_name'),
			'ICRP Cancer Types' : ('category_icrp_ct', 'convert_dict_name'),
			'ICRP CSO Categories' : ('category_icrp_cso', 'convert_dict_name'),
			'BRA Categories' : ('category_bra', 'convert_dict_name'),
			'HRA Categories' : ('category_hra', 'convert_dict_name'),
			'SDG Categories' : ('category_sdg', 'convert_dict_name'),
		})





class DslDatasetsConverter(DslDataConverter):
	"""Converter holding the column rules for `datasets` dataframes.
	"""

	def __init__(self, df, verbose=False):
		"""
		df: dataframe of datasets records.
		verbose: bool, passed on to the base converter.
		"""

		super().__init__(df, "datasets", verbose)

		## OVERRIDE VALUES

		self.column_transformations = OrderedDict({
			# 'new_col_name' : ('source_col_name', 'converter_method_name' [, extra_argument])
			'Date added' : ('date_inserted', ''),
			'Dataset ID' : ('id', ''),
			'DOI' : ('doi', ''),
			'Title' : ('title', ''),
			'Description' : ('description', ''),
			'Repository' : ('repository_id', ''),
			'Publication year' : ('year', ''),
			'Dataset author' : ('authors', 'convert_dict_name'),
			'Associated publication' : ('associated_publication_id', ''),
			'Funders' : ('funders', 'convert_dict_name'),
			'Funders GRID IDs' : ('funders', 'convert_dict_ids'),
			'Funders countries' : ('funders', 'convert_country_name'),
			'Research organizations' : ('research_orgs', 'convert_dict_name'),
			'GRID IDs' : ('research_orgs', 'convert_dict_ids'),
			'Countries' : ('research_orgs', 'convert_country_name'),
			'Source Linkout' : ('figshare_url', ''),
			'Dimensions URL' : ('id', 'convert_id_to_url', 'datasets'),
			'Concepts' : ('concepts', 'convert_list'),
			'FOR (ANZSRC) Categories' : ('category_for', 'convert_dict_name'),
			'RCDC Categories' : ('category_rcdc', 'convert_dict_name'),
			'HRCS HC Categories' : ('category_hrcs_hc', 'convert_dict_name'),
			'HRCS RAC Categories' : ('category_hrcs_rac', 'convert_dict_name'),
			'ICRP Cancer Types' : ('category_icrp_ct', 'convert_dict_name'),
			'ICRP CSO Categories' : ('category_icrp_cso', 'convert_dict_name'),
			'BRA Categories' : ('category_bra', 'convert_dict_name'),
			'HRA Categories' : ('category_hra', 'convert_dict_name'),
			'SDG Categories' : ('category_sdg', 'convert_dict_name'),
		})




class DslClinicaltrialsConverter(DslDataConverter):
	"""Converter holding the column rules for `clinical_trials` dataframes.
	"""

	def __init__(self, df, verbose=False):
		"""
		df: dataframe of clinical trials records.
		verbose: bool, passed on to the base converter.
		"""

		super().__init__(df, "clinical_trials", verbose)

		self.column_transformations = OrderedDict({
			# 'new_col_name' : ('source_col_name', 'converter_method_name' [, extra_argument])
			'Date added' : ('date_inserted', ''),
			'Trial ID' : ('id', ''),
			'Title' : ('title', ''),
			'Brief title' : ('brief_title', ''),
			'Acronym' : ('acronym', ''),
			'Abstract' : ('abstract', ''),
			'Publication date' : ('date', ''),
			'Active years' : ('active_years', 'convert_list'),
			'Phase' : ('phase', ''),
			'Conditions' : ('conditions', 'convert_list'),
			'Intervention' : ('interventions', 'convert_interventions_dict'),
			'Gender' : ('gender', ''),
			'Registry' : ('registry', ''),
			'Investigators/Contacts' : ('investigators', 'convert_investigators_cltrials'),
			'Sponsors/Collaborators' : ('research_orgs', 'convert_dict_name'),
			'GRID IDs' : ('research_orgs', 'convert_dict_ids'),
			'Country of Sponsor/Collaborator' : ('research_orgs', 'convert_country_name'),
			'Collaborating Funders' : ('funders', 'convert_dict_name'),
			'Funder Country' : ('funders', 'convert_country_name'),
			'Source Linkout' : ('linkout', ''),
			'Dimensions URL' : ('id', 'convert_id_to_url', 'clinical_trials'),
			'Concepts' : ('concepts', 'convert_list'),
			'FOR (ANZSRC) Categories' : ('category_for', 'convert_dict_name'),
			'RCDC Categories' : ('category_rcdc', 'convert_dict_name'),
			'HRCS HC Categories' : ('category_hrcs_hc', 'convert_dict_name'),
			'HRCS RAC Categories' : ('category_hrcs_rac', 'convert_dict_name'),
			'ICRP Cancer Types' : ('category_icrp_ct', 'convert_dict_name'),
			'ICRP CSO Categories' : ('category_icrp_cso', 'convert_dict_name'),
			'BRA Categories' : ('category_bra', 'convert_dict_name'),
			'HRA Categories' : ('category_hra', 'convert_dict_name'),
			'SDG Categories' : ('category_sdg', 'convert_dict_name'),
		})







class DslPolicyDocumentsConverter(DslDataConverter):
	"""Converter holding the column rules for `policy_documents` dataframes.

	@TODO review
	"""

	def __init__(self, df, verbose=False):
		"""
		df: dataframe of policy documents records.
		verbose: bool, passed on to the base converter.
		"""

		# object type aligned with the source name used for 'Dimensions URL' below
		# (it used to be a copy-pasted "grants")
		super().__init__(df, "policy_documents", verbose)

		self.column_transformations = OrderedDict({
			# 'new_col_name' : ('source_col_name', 'converter_method_name' [, extra_argument])
			'Date added' : ('date_inserted', ''),
			'Policy ID' : ('id', ''),
			'Title' : ('title', ''),
			'Abstract' : ('abstract', ''),
			'Date' : ('date', ''),
			'GRID IDs' : ('research_orgs', 'convert_dict_ids'),
			'Countries' : ('research_orgs', 'convert_country_name'),
			'Funders' : ('funders', 'convert_dict_name'),
			'Funders GRID IDs' : ('funders', 'convert_dict_ids'),
			'Funders country' : ('funders', 'convert_country_name'),
			'Research organizations' : ('research_orgs', 'convert_dict_name'),
			'Source linkout' : ('linkout', ''),
			'Dimensions URL' : ('id', 'convert_id_to_url', 'policy_documents'),
			'Concepts' : ('concepts', 'convert_list'),
			'FOR (ANZSRC) Categories' : ('category_for', 'convert_dict_name'),
			'RCDC Categories' : ('category_rcdc', 'convert_dict_name'),
			'HRCS HC Categories' : ('category_hrcs_hc', 'convert_dict_name'),
			'HRCS RAC Categories' : ('category_hrcs_rac', 'convert_dict_name'),
			'ICRP Cancer Types' : ('category_icrp_ct', 'convert_dict_name'),
			'ICRP CSO Categories' : ('category_icrp_cso', 'convert_dict_name'),
			'BRA Categories' : ('category_bra', 'convert_dict_name'),
			'HRA Categories' : ('category_hra', 'convert_dict_name'),
			'SDG Categories' : ('category_sdg', 'convert_dict_name'),
		})





class DslOrganizationsConverter(DslDataConverter):
	"""Converter holding the column rules for `organizations` dataframes.

	@TODO review
	"""

	def __init__(self, df, verbose=False):
		"""
		df: dataframe of organizations records.
		verbose: bool, passed on to the base converter.
		"""

		# it used to be a copy-pasted "grants"
		super().__init__(df, "organizations", verbose)

		self.column_transformations = OrderedDict({
			# 'new_col_name' : ('source_col_name', 'converter_method_name' [, extra_argument])
			'GRID ID' : ('id', ''),
			'Name' : ('name', ''),
			# 'Dimensions URL' : ('id', 'convert_id_to_url', 'organizations'),
		})




class DslResearchersConverter(DslDataConverter):
	"""Converter holding the column rules for `researchers` dataframes.

	@TODO review
	"""

	def __init__(self, df, verbose=False):
		"""
		df: dataframe of researchers records.
		verbose: bool, passed on to the base converter.
		"""

		# object type aligned with the source name used for 'Dimensions URL' below
		# (it used to be a copy-pasted "grants")
		super().__init__(df, "researchers", verbose)

		self.column_transformations = OrderedDict({
			# 'new_col_name' : ('source_col_name', 'converter_method_name' [, extra_argument])
			'Researcher ID' : ('id', ''),
			'First Name' : ('first_name', ''),
			'Last Name' : ('last_name', ''),
			'Orcid IDs' : ('orcid_id', ''),
			'Research organizations' : ('research_orgs', 'convert_dict_name'),
			'GRID IDs' : ('research_orgs', 'convert_dict_ids'),
			'Countries' : ('research_orgs', 'convert_country_name'),
			'Dimensions URL' : ('id', 'convert_id_to_url', 'researchers'),
		})





class DslReportsConverter(DslDataConverter):
	"""Converter holding the column rules for `reports` dataframes.

	@TODO review
	"""

	def __init__(self, df, verbose=False):
		"""
		df: dataframe of reports records.
		verbose: bool, passed on to the base converter.
		"""

		# object type aligned with the source name used for 'Dimensions URL' below
		# (it used to be a copy-pasted "grants")
		super().__init__(df, "reports", verbose)

		self.column_transformations = OrderedDict({
			# 'new_col_name' : ('source_col_name', 'converter_method_name' [, extra_argument])
			'Report ID' : ('id', ''),
			'Dimensions URL' : ('id', 'convert_id_to_url', 'reports'),
		})




class DslSourceTitlesConverter(DslDataConverter):
	"""Converter holding the column rules for `source_titles` dataframes.

	@TODO review
	"""

	def __init__(self, df, verbose=False):
		"""
		df: dataframe of source titles records.
		verbose: bool, passed on to the base converter.
		"""

		# object type aligned with the source name used for 'Dimensions URL' below
		# (it used to be a copy-pasted "grants")
		super().__init__(df, "source_titles", verbose)

		self.column_transformations = OrderedDict({
			# 'new_col_name' : ('source_col_name', 'converter_method_name' [, extra_argument])
			'Source ID' : ('id', ''),
			'Source title' : ('title', ''),
			'Type' : ('type', ''),
			'ISSNs' : ('issn', ''),
			'Publisher' : ('publisher', ''),
			'Start year' : ('start_year', ''),
			'Dimensions URL' : ('id', 'convert_id_to_url', 'source_titles'),
		})