Skip to content

scholarpy module

Main module.

Dsl (Dsl)

Source code in scholarpy/scholarpy.py
class Dsl(dimcli.Dsl):
    def __init__(self, key=None, **kwargs):
        """Initialize the DSL client and log in to the Dimensions API.

        Args:
            key (str, optional): The Dimensions API key. When omitted, the value of the
                DIM_TOKEN environment variable is used. Defaults to None.
            **kwargs: Keyword arguments passed on to ``dimcli.Dsl.__init__``.

        Raises:
            ValueError: If no key is passed and DIM_TOKEN is not set. The Dimensions
                API lab page is opened in the web browser before raising.
        """
        if key is None:
            key = os.environ.get("DIM_TOKEN")
            if key is None:
                webbrowser.open("https://api-lab.dimensions.ai")
                raise ValueError(
                    "No Dimensions API key can be found. Please go to https://www.dimensions.ai/contact-us to request an API key."
                )

        # Log in with the resolved key regardless of where it came from; previously an
        # explicitly passed key was silently ignored because login only ran on the
        # environment-variable path.
        dimcli.login(key=key)

        super().__init__(**kwargs)

    def search_researcher_by_id(
        self,
        id,
        fields=None,
        iterative=False,
        limit=1000,
        return_df=False,
        **kwargs,
    ):
        """Search a researcher by a Dimensions researcher ID.

        Args:
            id (str): The ID of the researcher. For example, ur.010551261751.12
            fields (str, optional): The fields to return. For example, [basics+extras]. Defaults to None,
                in which case [basics+extras] is used.
            iterative (bool, optional): If True, the query is run with query_iterative. Defaults to False.
            limit (int, optional): Passed as ``limit`` to query_iterative, or appended to the query as
                ``limit N`` otherwise. Defaults to 1000.
            return_df (bool, optional): If True, the results will be returned as a dataframe. Defaults to False.
            **kwargs: Extra keyword arguments forwarded to query_iterative when ``iterative`` is True.

        Returns:
            dimcli.DslDataset | pd.DataFrame: The query result. When ``return_df`` is True, a dataframe
                with an "index" column (the field names of the transposed result) and a "value" column
                (the list of values found for each field).
        """
        if fields is None:
            fields = "[basics+extras]"

        query = f'search researchers where id="{id}" return researchers{fields}'

        if iterative:
            result = self.query_iterative(query, limit=limit, **kwargs)
        else:
            result = self.query(f"{query} limit {limit}")

        if not return_df:
            return result

        # Transpose so that every returned field becomes one row of the output.
        transposed = result.as_dataframe().transpose()
        return pd.DataFrame(
            {"index": transposed.index.tolist(), "value": transposed.values.tolist()}
        )

    def search_researcher_by_name(
        self,
        name,
        fields=None,
        iterative=False,
        limit=1000,
        return_list=False,
        **kwargs,
    ):
        """Search a researcher by name.

        Only researchers with ``obsolete=0`` and at least one publication are searched.

        Args:
            name (str): The name of the researcher.
            fields (str, optional): The fields to return. For example, [basics+extras]. Defaults to None,
                in which case [basics+extras] is used.
            iterative (bool, optional): If True, the query is run with query_iterative. Defaults to False.
            limit (int, optional): Passed as ``limit`` to query_iterative, or appended to the query as
                ``limit N`` otherwise. Defaults to 1000.
            return_list (bool, optional): If True, a list of display labels is returned along with the
                results. Defaults to False.
            **kwargs: Accepted for call compatibility; not used by this method.

        Returns:
            dimcli.DslDataset | tuple: The query result. When ``return_list`` is True, a tuple
                ``(result, items)`` where ``items`` is a list of "first last | id | organization"
                strings sorted by organization name, or None when the result is empty.
        """

        if fields is None:
            fields = "[basics+extras]"

        query = f'search researchers for "\\"{name}\\"" where obsolete=0 and total_publications>0 return researchers{fields}'
        if iterative:
            result = self.query_iterative(query, limit=limit)
        else:
            result = self.query(f"{query} limit {limit}")

        if not return_list:
            return result

        df = result.as_dataframe()
        if df.empty:
            return result, None

        # The flattened organization column is absent when no returned researcher has a
        # current organization; fall back to NaN (rendered as "nan", the same label rows
        # without an organization already get) instead of raising KeyError.
        org_column = "current_research_org.name"
        if org_column in df.columns:
            df["current_research_org_name"] = df[org_column]
        else:
            df["current_research_org_name"] = float("nan")
        df.sort_values("current_research_org_name", inplace=True)

        items = [
            str(row.first_name)
            + " "
            + str(row.last_name)
            + " | "
            + str(row.id)
            + " | "
            + str(row.current_research_org_name)
            for row in df.itertuples()
        ]
        return result, items

    def search_researcher_collaborators(self, id, pubs=None):
        """Search collaborators of a researcher.

        Args:
            id (str): The ID of the researcher. For example, ur.010551261751.12
            pubs (dimcli.DslDataset, optional): The publications of the researcher. When omitted,
                they are fetched with search_pubs_by_researcher_id(id). Defaults to None.

        Returns:
            pd.DataFrame: One row per collaborator with the columns "Name" (first token of the
                first name plus the last name), "Count" (number of author entries with that name)
                and "Institution" (name of the first listed affiliation, or "" when unavailable).
        """
        if pubs is None:
            pubs = self.search_pubs_by_researcher_id(id)

        authors = pubs.as_dataframe_authors()
        # Copy the filtered frame so the new column is not written into a slice of the original.
        authors = authors[authors["researcher_id"] != id].copy()
        authors["Name"] = (
            authors["first_name"].str.split(" ").str[0] + " " + authors["last_name"]
        )

        names = authors.drop_duplicates("Name").copy()
        institutions = []
        for affiliation_list in names["affiliations"].values.tolist():
            try:
                institutions.append(affiliation_list[0]["name"])
            except (LookupError, TypeError):
                # Missing, empty or malformed affiliation data -> no institution.
                institutions.append("")
        names["Institution"] = institutions
        names = names[["Name", "Institution"]]

        # Build the frame from the Series directly: the column name produced by
        # pd.DataFrame(series) differs between pandas versions ("Name" vs "count").
        counts = authors["Name"].value_counts()
        result = pd.DataFrame({"Name": counts.index, "Count": counts.values})
        return result.merge(names, on="Name")

    def search_orcid_by_name(
        self,
        name,
        fields=None,
        iterative=False,
        limit=1000,
        return_list=False,
        **kwargs,
    ):
        """Search a researcher orcid by name.

        Args:
            name (str): The name of the researcher.
            fields (str, optional): The fields to return. For example, [basics+extras]. Defaults to None.
            iterative (bool, optional): If True, the query will be iterative. Defaults to False.
            limit (int, optional): The number of results to return. Defaults to 1000.
            return_list (bool, optional): If True, the results will be returned as a list. Defaults to False.
            **kwargs: Accepted for call compatibility; not used by this method.

        Returns:
            pd.DataFrame | list | None: The researchers that have an ORCID, sorted by organization
                name, with "orcid_id" reduced to its first entry. When ``return_list`` is True, a
                list of "first last | orcid | organization" strings (rows with a missing part are
                dropped). None when the result has no "orcid_id" column.
        """

        result = self.search_researcher_by_name(
            name, fields=fields, iterative=iterative, limit=limit, return_list=False
        )
        df = result.as_dataframe()
        if "orcid_id" not in df.columns:
            return None

        # Copy the filtered frame so the column assignments below operate on an
        # independent dataframe rather than on a slice of the original.
        df = df[df["orcid_id"].notnull()].copy()
        # orcid_id holds a list per researcher; keep the first entry only.
        df["orcid_id"] = [orcid_ids[0] for orcid_ids in df["orcid_id"].values.tolist()]

        # The flattened organization column is absent when no returned researcher has a
        # current organization; use NaN in that case instead of raising KeyError.
        org_column = "current_research_org.name"
        if org_column in df.columns:
            df["current_research_org_name"] = df[org_column]
        else:
            df["current_research_org_name"] = float("nan")
        df.sort_values("current_research_org_name", inplace=True)

        if not return_list:
            return df

        df["uid"] = (
            df["first_name"]
            + " "
            + df["last_name"]
            + " | "
            + df["orcid_id"]
            + " | "
            + df["current_research_org_name"]
        )
        df = df[df["uid"].notnull()]
        return df["uid"].tolist()

    def search_journal_by_id(self, id, fields=None, **kwargs):
        """Look up a journal (source title) by its ID.

        Args:
            id (str): The ID of the journal. For example, jour.1018957
            fields (str, optional): The fields to return, e.g. [basics+extras]. When None, no
                field list is appended to the query. Defaults to None.
            **kwargs: Accepted for call compatibility; not used by this method.

        Returns:
            dimcli.DslDataset: JSON data of the results.
        """
        requested = "" if fields is None else fields
        return self.query(
            f'search source_titles where id="{id}" return source_titles{requested}'
        )

    def search_journal_by_title(
        self,
        title,
        exact_match=True,
        fields=None,
        iterative=False,
        limit=1000,
        **kwargs,
    ):
        """Find journals (source titles) by title.

        Args:
            title (str): The title of the journal.
            exact_match (bool, optional): If True, the first hit whose title equals ``title``
                (case-insensitively) is looked up again by ID and that result is returned.
                Defaults to True.
            fields (str, optional): The fields to return, e.g. [basics+extras]. Defaults to None.
            iterative (bool, optional): If True, the search is run with query_iterative. Defaults to False.
            limit (int, optional): Passed as ``limit`` to query_iterative, or appended to the query as
                ``limit N`` otherwise. Defaults to 1000.
            **kwargs: Accepted for call compatibility; not used by this method.

        Returns:
            dimcli.DslDataset: JSON data of the results, or None (after printing a message) when
                the exact-match lookup fails.
        """
        requested = "" if fields is None else fields
        search_query = f'search source_titles for "{title}" return source_titles{requested}'

        if iterative:
            result = self.query_iterative(search_query, limit=limit)
        else:
            result = self.query(f"{search_query} limit {limit}")

        # Without exact matching the raw search result is handed back untouched.
        if not exact_match:
            return result

        try:
            candidates = result.as_dataframe()
            same_title = candidates["title"].str.lower() == title.lower()
            journal_id = candidates[same_title]["id"].values.tolist()[0]
            return self.query(
                f'search source_titles where id="{journal_id}" return source_titles{requested}'
            )
        except Exception:
            # Any failure in the lookup (e.g. no row carries the requested title) is
            # reported as "not found".
            print("No journal can be found.")
            return None

    def search_org_by_id(self, id, fields=None, **kwargs):
        """Look up an organization by its ID.

        Args:
            id (str): The ID of the organization. For example, grid.411461.7
            fields (str, optional): The fields to return, e.g. [basics+extras]. When None, no
                field list is appended to the query. Defaults to None.
            **kwargs: Accepted for call compatibility; not used by this method.

        Returns:
            dimcli.DslDataset: JSON data of the results.
        """
        requested = "" if fields is None else fields
        return self.query(
            f'search organizations where id="{id}" return organizations{requested}'
        )

    def search_org_by_name(
        self, name, exact_match=True, fields=None, iterative=False, limit=1000, return_list=False, **kwargs
    ):
        """Find organizations by name.

        Args:
            name (str): The name of the organization.
            exact_match (bool, optional): If True, the first hit whose name equals ``name``
                (case-insensitively) is looked up again by ID and that result is used. Defaults to True.
            fields (str, optional): The fields to return, e.g. [basics+extras]. Defaults to None.
            iterative (bool, optional): If True, the search is run with query_iterative. Defaults to False.
            limit (int, optional): Passed as ``limit`` to query_iterative, or appended to the query as
                ``limit N`` otherwise. Defaults to 1000.
            return_list (bool, optional): If True, a list of "id | name" strings sorted by name is
                returned instead of the dataset (None when the result is empty). Defaults to False.
            **kwargs: Accepted for call compatibility; not used by this method.

        Returns:
            dimcli.DslDataset: JSON data of the results, a list when ``return_list`` is True, or
                None (after printing a message) when post-processing the search result fails.
        """
        requested = "" if fields is None else fields
        search_query = f'search organizations for "\\"{name}\\"" return organizations{requested}'

        if iterative:
            result = self.query_iterative(search_query, limit=limit)
        else:
            result = self.query(f"{search_query} limit {limit}")

        try:
            if exact_match:
                candidates = result.as_dataframe()
                same_name = candidates["name"].str.lower() == name.lower()
                org_id = candidates[same_name]["id"].values.tolist()[0]
                result = self.query(
                    f'search organizations where id="{org_id}" return organizations{requested}'
                )

            if not return_list:
                return result

            orgs = result.as_dataframe()
            if orgs.empty:
                return None

            orgs.sort_values("name", inplace=True)
            orgs["name_id"] = orgs["id"] + " | " + orgs["name"]
            return orgs["name_id"].values.tolist()
        except Exception:
            # Any failure above (e.g. no row carries the requested name) is reported
            # as "not found".
            print("No organization can be found.")
            return None

    def search_pubs_by_researcher_id(
        self,
        id,
        start_year=None,
        end_year=None,
        fields=None,
        iterative=False,
        limit=1000,
        **kwargs,
    ):
        """Fetch the publications of a researcher.

        Args:
            id (str): The ID of the researcher. For example, ur.010551261751.12
            start_year (int, optional): Only include publications with year >= start_year. Defaults to None.
            end_year (int, optional): Only include publications with year <= end_year. Defaults to None.
            fields (str, optional): The fields to return. Defaults to None, in which case
                [basics+authors_count+times_cited+dimensions_url] is used.
            iterative (bool, optional): If True, the query is run with query_iterative. Defaults to False.
            limit (int, optional): Passed as ``limit`` to query_iterative, or appended to the query as
                ``limit N`` otherwise. Defaults to 1000.
            **kwargs: Accepted for call compatibility; not used by this method.

        Returns:
            dimcli.DslDataset: JSON data of the results.
        """
        if fields is None:
            fields = "[basics+authors_count+times_cited+dimensions_url]"

        # The researcher filter is always present; year bounds are added only when given.
        conditions = [f'researchers.id="{id}"']
        if start_year is not None:
            conditions.append(f"year>={start_year}")
        if end_year is not None:
            conditions.append(f"year<={end_year}")
        where_clause = " and ".join(conditions)
        query = f"search publications where {where_clause} return publications{fields}"

        if iterative:
            return self.query_iterative(query, limit=limit)
        return self.query(f"{query} limit {limit}")

    def search_pubs_by_journal_id(
        self,
        id,
        start_year=None,
        end_year=None,
        fields=None,
        iterative=False,
        limit=1000,
        **kwargs,
    ):
        """Fetch the publications of a journal.

        Args:
            id (str): The ID of the journal. For example, jour.1018957
            start_year (int, optional): Only include publications with year >= start_year. Defaults to None.
            end_year (int, optional): Only include publications with year <= end_year. Defaults to None.
            fields (str, optional): The fields to return, e.g. [basics+extras]. When None, no
                field list is appended to the query. Defaults to None.
            iterative (bool, optional): If True, the query is run with query_iterative. Defaults to False.
            limit (int, optional): Passed as ``limit`` to query_iterative, or appended to the query as
                ``limit N`` otherwise. Defaults to 1000.
            **kwargs: Accepted for call compatibility; not used by this method.

        Returns:
            dimcli.DslDataset: JSON data of the results.
        """
        if fields is None:
            fields = ""

        # The journal filter is always present; year bounds are added only when given.
        conditions = [f'journal.id="{id}"']
        if start_year is not None:
            conditions.append(f"year>={start_year}")
        if end_year is not None:
            conditions.append(f"year<={end_year}")
        where_clause = " and ".join(conditions)
        query = f"search publications where {where_clause} return publications{fields}"

        if iterative:
            return self.query_iterative(query, limit=limit)
        return self.query(f"{query} limit {limit}")

    def search_pubs_by_org_id(
        self,
        id,
        start_year=None,
        end_year=None,
        fields=None,
        iterative=False,
        limit=1000,
        **kwargs,
    ):
        """Fetch the publications of an organization.

        Args:
            id (str): The ID of the organization. For example, grid.411461.7
            start_year (int, optional): Only include publications with year >= start_year. Defaults to None.
            end_year (int, optional): Only include publications with year <= end_year. Defaults to None.
            fields (str, optional): The fields to return, e.g. [basics+extras]. When None, no
                field list is appended to the query. Defaults to None.
            iterative (bool, optional): If True, the query is run with query_iterative. Defaults to False.
            limit (int, optional): Passed as ``limit`` to query_iterative, or appended to the query as
                ``limit N`` otherwise. Defaults to 1000.
            **kwargs: Accepted for call compatibility; not used by this method.

        Returns:
            dimcli.DslDataset: JSON data of the results.
        """
        if fields is None:
            fields = ""

        # The organization filter is always present; year bounds are added only when given.
        conditions = [f'research_orgs="{id}"']
        if start_year is not None:
            conditions.append(f"year>={start_year}")
        if end_year is not None:
            conditions.append(f"year<={end_year}")
        where_clause = " and ".join(conditions)
        query = f"search publications where {where_clause} return publications{fields}"

        if iterative:
            return self.query_iterative(query, limit=limit)
        return self.query(f"{query} limit {limit}")

    def search_pubs_by_keyword(
        self,
        keyword,
        exact_match=True,
        scope="title_abstract_only",
        start_year=None,
        end_year=None,
        journal_id=None,
        fields=None,
        sorted_field="times_cited",
        iterative=False,
        limit=1000,
        **kwargs,
    ):
        """Run a keyword search over publications.

        Args:
            keyword (str): The keyword to search.
            exact_match (bool, optional): If True, the keyword is wrapped in escaped double quotes
                inside the search string. Defaults to True.
            scope (str, optional): The search index to use; one of "authors", "concepts",
                "full_data", "full_data_exact", "title_abstract_only", "title_only".
                Defaults to "title_abstract_only".
            start_year (int, optional): Only include publications with year >= start_year. Defaults to None.
            end_year (int, optional): Only include publications with year <= end_year. Defaults to None.
            journal_id (str, optional): Restrict the search to the journal with this ID. Defaults to None.
            fields (str, optional): The fields to return. Defaults to None, in which case
                [basics+altmetric+times_cited+field_citation_ratio+authors_count+doi+dimensions_url] is used.
            sorted_field (str, optional): The field to sort by. Defaults to "times_cited".
            iterative (bool, optional): If True, the query is run with query_iterative. Defaults to False.
            limit (int, optional): Passed as ``limit`` to query_iterative, or appended to the query as
                ``limit N`` otherwise. Defaults to 1000.
            **kwargs: Accepted for call compatibility; not used by this method.

        Raises:
            ValueError: If ``scope`` is not one of the allowed scopes.

        Returns:
            dimcli.DslDataset: JSON data of the results.
        """
        allowed_scopes = [
            "authors",
            "concepts",
            "full_data",
            "full_data_exact",
            "title_abstract_only",
            "title_only",
        ]
        if scope not in allowed_scopes:
            raise ValueError(f"scope must be one of {allowed_scopes}")

        quote = '\\"' if exact_match else ""

        if fields is None:
            fields = "[basics+altmetric+times_cited+field_citation_ratio+authors_count+doi+dimensions_url]"

        year_conditions = []
        if start_year is not None:
            year_conditions.append(f"year>={start_year}")
        if end_year is not None:
            year_conditions.append(f"year<={end_year}")

        if year_conditions:
            # With a year filter the journal restriction is chained on with "and".
            years = " and ".join(year_conditions)
            journal = f'and journal.id="{journal_id}"' if journal_id is not None else ""
            filters = f"where {years} {journal}"
        elif journal_id is not None:
            # Without a year filter the journal restriction opens the where clause itself.
            filters = f'where journal.id="{journal_id}"'
        else:
            filters = ""

        query = f'search publications in {scope} for "{quote}{keyword}{quote}" {filters} return publications{fields} sort by {sorted_field}'

        if iterative:
            return self.query_iterative(query, limit=limit)
        return self.query(f"{query} limit {limit}")

    def search_grants_by_keyword(
        self,
        keyword,
        exact_match=True,
        scope="title_only",
        start_year=None,
        end_year=None,
        fields=None,
        sorted_field="start_year",
        iterative=False,
        limit=1000,
        **kwargs,
    ):
        """Run a keyword search over grants.

        Args:
            keyword (str): The keyword to search.
            exact_match (bool, optional): If True, the keyword is wrapped in escaped double quotes
                inside the search string. Defaults to True.
            scope (str, optional): The search index to use; one of "concepts", "full_data",
                "investigators", "title_abstract_only", "title_only". Defaults to "title_only".
            start_year (int, optional): Only include grants with start_year >= start_year. Defaults to None.
            end_year (int, optional): A 4-digit year; only grants with end_date <= "<end_year>-12-31"
                are included. Defaults to None.
            fields (str, optional): The fields to return. Defaults to None, in which case
                [basics+extras+dimensions_url] is used.
            sorted_field (str, optional): The field to sort by. Defaults to "start_year".
            iterative (bool, optional): If True, the query is run with query_iterative. Defaults to False.
            limit (int, optional): Passed as ``limit`` to query_iterative, or appended to the query as
                ``limit N`` otherwise. Defaults to 1000.
            **kwargs: Accepted for call compatibility; not used by this method.

        Raises:
            ValueError: If ``scope`` is not one of the allowed scopes, or ``end_year`` does not
                have exactly four characters.

        Returns:
            dimcli.DslDataset: JSON data of the results.
        """
        quote = '\\"' if exact_match else ""

        allowed_scopes = [
            "concepts",
            "full_data",
            "investigators",
            "title_abstract_only",
            "title_only",
        ]
        if scope not in allowed_scopes:
            raise ValueError(f"scope must be one of {allowed_scopes}")

        if fields is None:
            fields = "[basics+extras+dimensions_url]"

        conditions = []
        if start_year is not None:
            conditions.append(f"start_year>={start_year}")
        if end_year is not None:
            year_text = str(end_year)
            if len(year_text) != 4:
                raise ValueError("end_year must be a 4-digit number")
            # The end bound is compared against end_date, so use the last day of that year.
            conditions.append(f'end_date<="{year_text}-12-31"')

        where_clause = f" where {' and '.join(conditions)}" if conditions else ""
        query = f'search grants in {scope} for "{quote}{keyword}{quote}"{where_clause} return grants{fields} sort by {sorted_field}'

        if iterative:
            return self.query_iterative(query, limit=limit)
        return self.query(f"{query} limit {limit}")

    def h_index(self, id, iterative=False, limit=1000):
        """Get the h-index of a researcher.

        The h-index is the largest h such that at least h of the retrieved publications
        have been cited at least h times each, e.g. [10, 8, 5, 4, 3] -> 4,
        [25, 8, 5, 3, 3] -> 3, [1000, 20] -> 2.

        Args:
            id (str): The ID of the researcher. For example, ur.010551261751.12
            iterative (bool, optional): If True, the query is run with query_iterative. Defaults to False.
            limit (int, optional): Passed as ``limit`` to query_iterative, or appended to the query as
                ``limit N`` otherwise. Defaults to 1000.

        Returns:
            int: The h-index computed from the retrieved publications; 0 when the result has
                no "times_cited" column.
        """
        query = f'search publications where researchers.id = "{id}" return publications[times_cited] sort by times_cited'

        if iterative:
            result = self.query_iterative(query, limit=limit)
        else:
            result = self.query(f"{query} limit {limit}")

        df = result.as_dataframe()
        if "times_cited" not in df.columns:
            return 0

        # Sort locally so the result does not depend on the order the API returned;
        # missing citation counts are treated as 0.
        citations = sorted(df["times_cited"].fillna(0), reverse=True)

        # Iterative scan instead of recursion: no recursion-depth limit for long
        # publication lists and no repeated list slicing.
        h = 0
        for rank, times_cited in enumerate(citations, start=1):
            if times_cited < rank:
                break
            h = rank
        return h

    def researcher_pubs_stats(
        self,
        id,
        start_year=None,
        end_year=None,
        iterative=False,
        limit=1000,
        return_plot=False,
        **kwargs,
    ):
        """Count a researcher's publications per year.

        Args:
            id (str): The ID of the researcher. For example, ur.010551261751.12
            start_year (int, optional): The start year of the publication. Defaults to None.
            end_year (int, optional): The end year of the publication. Defaults to None.
            iterative (bool, optional): If True, the query will be iterative. Defaults to False.
            limit (int, optional): The number of results to return. Defaults to 1000.
            return_plot (bool, optional): If True, a bar plot of the counts is returned instead
                of the dataframe. Defaults to False.
            **kwargs: Keyword arguments passed to the pandas bar plot when ``return_plot`` is True.

        Returns:
            pandas.DataFrame: A dataframe with the columns "year" and "citations", sorted by year.
                Note that "citations" holds the number of publications found for that year.
        """
        publications = self.search_pubs_by_researcher_id(
            id, start_year, end_year, iterative=iterative, limit=limit
        ).as_dataframe()
        per_year = publications["year"].value_counts().sort_index()

        if return_plot:
            return per_year.plot.bar(**kwargs)
        return pd.DataFrame({"year": per_year.index, "citations": per_year.values})

    def researcher_pubs_authors(
        self, id, start_year=None, end_year=None, iterative=False, limit=1000
    ):
        """List a researcher's publications together with their author counts.

        Args:
            id (str): The ID of the researcher. For example, ur.010551261751.12
            start_year (int, optional): The start year of the publication. Defaults to None.
            end_year (int, optional): The end year of the publication. Defaults to None.
            iterative (bool, optional): If True, the query will be iterative. Defaults to False.
            limit (int, optional): The number of results to return. Defaults to 1000.

        Returns:
            pandas.DataFrame: The publications dataframe with an extra "authors" column holding
                the number of author rows recorded for each publication ID.
        """
        result = self.search_pubs_by_researcher_id(
            id, start_year, end_year, iterative=iterative, limit=limit
        )
        publications = result.as_dataframe()

        # Number of rows per publication in the authors table = number of authors.
        author_counts = result.as_dataframe_authors()["pub_id"].value_counts()
        counts_by_pub = pd.DataFrame(
            {"id": author_counts.index, "authors": author_counts.values}
        ).set_index("id")

        return publications.join(counts_by_pub, on="id")

    def researcher_annual_stats(self, data, geonames_df=None):
        """Get the annual stats of a researcher, including publications, collaborators, collaborating institutions, and cities.

        Only the first affiliation listed for each author entry is considered.

        Args:
            data (dimcli.DslDataset): JSON data of the input. It can be derived from search_pubs_by_researcher_id().
            geonames_df (pd.DataFrame, optional): The geonames dataframe, passed to geoname_latlon()
                to resolve city IDs to coordinates. Defaults to None.

        Returns:
            pd.DataFrame | tuple: A dataframe with, per year, the number of publications ("pubs"),
                distinct author names ("collaborators"), distinct institutions ("institutions") and
                distinct city IDs ("cities"). When ``geonames_df`` is given, a tuple of that dataframe
                and a per-author dataframe with the columns name, year, institution, city, country,
                latitude and longitude.
        """

        def first_affiliation_value(affiliation_list, key, default):
            """Return ``key`` of the first affiliation, or ``default`` when it is unavailable."""
            try:
                return affiliation_list[0][key]
            except (LookupError, TypeError):
                # Empty list, missing key, or a non-subscriptable placeholder (None/NaN).
                return default

        pubs = data.as_dataframe()
        years_dict = pubs[["id", "year"]].set_index("id").to_dict()["year"]
        df = data.as_dataframe_authors()
        df["name"] = df["first_name"] + " " + df["last_name"]
        affiliations = df["affiliations"].values.tolist()
        pub_ids = df["pub_id"].values.tolist()

        institutions = [first_affiliation_value(a, "name", "") for a in affiliations]
        city_ids = [first_affiliation_value(a, "city_id", 0) for a in affiliations]
        cities = [first_affiliation_value(a, "city", "") for a in affiliations]
        countries = [first_affiliation_value(a, "country", "") for a in affiliations]

        # Each author row inherits the year of the publication it belongs to.
        df["year"] = [years_dict[pub_id] for pub_id in pub_ids]
        df["institution"] = institutions
        df["city"] = cities
        df["country"] = countries
        df["city_id"] = city_ids

        if geonames_df is not None:
            latitudes = []
            longitudes = []
            for city_id in city_ids:
                try:
                    latitude, longitude = geoname_latlon(city_id, geonames_df)
                except Exception:
                    # Best effort: a city that cannot be resolved gets (0, 0).
                    latitude, longitude = 0, 0
                latitudes.append(latitude)
                longitudes.append(longitude)
            df["latitude"] = latitudes
            df["longitude"] = longitudes

        # For the distinct counts: group by (year, value), then count groups per year.
        pubs_stats = pubs.groupby("year").size()
        collaborators_stats = (
            df.groupby(["year", "name"]).size().groupby(level=0).size()
        )
        institutions_stats = (
            df.groupby(["year", "institution"]).size().groupby(level=0).size()
        )
        cities_stats = df.groupby(
            ["year", "city_id"]).size().groupby(level=0).size()

        df2 = pd.DataFrame(
            {
                "year": cities_stats.index,
                "pubs": pubs_stats,
                "collaborators": collaborators_stats,
                "institutions": institutions_stats,
                "cities": cities_stats,
            }
        )
        if geonames_df is None:
            return df2

        return (
            df2,
            df[
                [
                    "name",
                    "year",
                    "institution",
                    "city",
                    "country",
                    "latitude",
                    "longitude",
                ]
            ],
        )

    def org_pubs_annual_stats(self, org_id, start_year=None, end_year=None, iterative=False, limit=1000, return_plot=False, **kwargs):
        """Get the per-year publication counts of an organization.

        Args:
            org_id (str): The ID of the organization. For example, grid.411461.7
            start_year (int, optional): Only include publications with year >= start_year. Defaults to None.
            end_year (int, optional): Only include publications with year <= end_year. Defaults to None.
            iterative (bool, optional): If True, the query is run with query_iterative. Defaults to False.
            limit (int, optional): Passed as ``limit`` to query_iterative, or appended to the query as
                ``limit N`` otherwise. Defaults to 1000.
            return_plot (bool, optional): If True, a bar chart of the results is returned as well. Defaults to False.
            **kwargs: Accepted for call compatibility; not used by this method.

        Returns:
            pd.DataFrame | tuple: The result dataframe with its "id" column renamed to "year".
                When ``return_plot`` is True, a tuple ``(dataframe, figure)`` where the figure
                is None if the dataframe is empty.
        """
        # The organization filter is always present; year bounds are added only when given.
        conditions = [f'research_orgs="{org_id}"']
        if start_year is not None:
            conditions.append(f"year>={start_year}")
        if end_year is not None:
            conditions.append(f"year<={end_year}")
        where_clause = " and ".join(conditions)
        query = f"search publications where {where_clause} return year"

        if iterative:
            result = self.query_iterative(query, limit=limit)
        else:
            result = self.query(f"{query} limit {limit}")

        yearly = result.as_dataframe()
        yearly.rename(columns={"id": "year"}, inplace=True)

        if not return_plot:
            return yearly
        if yearly.empty:
            return yearly, None

        # The organization name is only needed for the chart title.
        org_name = self.search_org_by_id(org_id).as_dataframe()["name"][0]
        fig = px.bar(yearly, x="year", y="count",
                     title=f"Publications from {org_name} - by year")
        return yearly, fig

    def org_grants_annual_stats(self, org_id, start_year=None, end_year=None, iterative=False, limit=100, return_plot=False, **kwargs):
        """Get the number of grants and the funding amount per start year for an organization.

        Args:
            org_id (str): The ID of the organization. For example, grid.411461.7
            start_year (int, optional): Only include grants starting in or after this year. Defaults to None.
            end_year (int, optional): Only include grants starting in or before this year. Defaults to None.
            iterative (bool, optional): If True, the query will be iterative. Defaults to False.
            limit (int, optional): The number of results to return. Defaults to 100.
            return_plot (bool, optional): If True, the plots of the results will be returned. Defaults to False.
            **kwargs: Accepted but not used by this method.

        Returns:
            pandas.DataFrame | tuple: The result dataframe (its "id" column renamed to "year") when
                return_plot is False; otherwise (dataframe, count figure, funding figure), where both
                figures are None if the dataframe is empty.
        """
        # Every supplied bound becomes a start_year condition, so end_year is
        # honoured too (it used to be silently dropped from the query).
        conditions = [f'research_orgs="{org_id}"']
        if start_year is not None:
            conditions.append(f"start_year>={start_year}")
        if end_year is not None:
            conditions.append(f"start_year<={end_year}")
        where_clause = " and ".join(conditions)
        query = f"search grants where {where_clause} return start_year aggregate funding"

        if iterative:
            result = self.query_iterative(query, limit=limit)
        else:
            query = f"{query} limit {limit}"
            result = self.query(query)

        df = result.as_dataframe()
        df.rename(columns={"id": "year"}, inplace=True)

        if not return_plot:
            return df
        if df.empty:
            return df, None, None

        # The organization name is only used for the chart titles.
        org_name = self.search_org_by_id(org_id).as_dataframe()["name"][0]
        fig_count = px.bar(df, x="year", y="count",
                           title=f"The number of grants for {org_name} - by year")
        fig_amount = px.bar(df, x="year", y="funding",
                            title=f"The funding amount for {org_name} - by year")
        return df, fig_count, fig_amount

    def org_grant_funders(
        self,
        org_id,
        start_year=None,
        end_year=None,
        iterative=False,
        limit=20,
        return_plot=False,
        **kwargs,
    ):
        """List the funders of an organization's grants, sorted by funding.

        Args:
            org_id (str): The ID of the organization. For example, grid.411461.7
            start_year (int, optional): Lower bound applied to the grants "year" field. Defaults to None.
            end_year (int, optional): Upper bound applied to the grants "year" field. Defaults to None.
            iterative (bool, optional): If True, the query is run through query_iterative(). Defaults to False.
            limit (int, optional): The limit passed to the query. Defaults to 20.
            return_plot (bool, optional): If True, a bar chart is returned together with the dataframe. Defaults to False.
            **kwargs: Accepted but not used by this method.

        Returns:
            pandas.DataFrame | tuple: The result dataframe when return_plot is False; otherwise a
                (dataframe, figure) tuple in which the figure is None if the dataframe is empty.
        """
        # Assemble the where-clause from whichever year bounds were supplied.
        conditions = [f'research_orgs="{org_id}"']
        if start_year is not None:
            conditions.append(f"year>={start_year}")
        if end_year is not None:
            conditions.append(f"year<={end_year}")
        where_clause = " and ".join(conditions)
        query = (
            f"search grants where {where_clause} "
            "return funders aggregate funding sort by funding"
        )

        if iterative:
            result = self.query_iterative(query, limit=limit)
        else:
            result = self.query(f"{query} limit {limit}")

        df = result.as_dataframe()
        if not return_plot:
            return df
        if df.empty:
            return df, None

        # The organization name is only used for the chart title.
        org_name = self.search_org_by_id(org_id).as_dataframe()["name"][0]
        fig = px.bar(
            df, x="name", y="funding", title=f"Funding for {org_name} - by funder"
        )
        return df, fig

    def search_grants_by_org(self, org_id, start_year=None, end_year=None, iterative=False, limit=1000, **kwargs):
        """Search grants by organization id.

        Args:
            org_id (str): The ID of the organization. For example, grid.411461.7
            start_year (int, optional): Only include grants starting in or after this year. Defaults to None.
            end_year (int, optional): Only include grants starting in or before this year. Defaults to None.
            iterative (bool, optional): If True, the query will be iterative. Defaults to False.
            limit (int, optional): The number of results to return. Defaults to 1000.
            **kwargs: Accepted but not used by this method.

        Returns:
            dimcli.DslDataset: JSON data of the results.
        """
        # Build the filter from whichever bounds were supplied. Previously the
        # end_year bound was dropped whenever start_year was also given.
        conditions = [f'research_orgs="{org_id}"']
        if start_year is not None:
            conditions.append(f"start_year>={start_year}")
        if end_year is not None:
            conditions.append(f"start_year<={end_year}")
        where_clause = " and ".join(conditions)
        query = f"search grants where {where_clause} return grants[basics+extras]"

        if iterative:
            result = self.query_iterative(query, limit=limit)
        else:
            query = f"{query} limit {limit}"
            result = self.query(query)

        return result

    def search_grants_by_researcher(self, researcher_id, start_year=None, end_year=None, iterative=False, limit=1000, **kwargs):
        """Search grants by researcher id.

        Args:
            researcher_id (str): The ID of the researcher. For example, ur.01361677540.55
            start_year (int, optional): Only include grants starting in or after this year. Defaults to None.
            end_year (int, optional): Only include grants starting in or before this year. Defaults to None.
            iterative (bool, optional): If True, the query will be iterative. Defaults to False.
            limit (int, optional): The number of results to return. Defaults to 1000.
            **kwargs: Accepted but not used by this method.

        Returns:
            dimcli.DslDataset: JSON data of the results.
        """
        # Build the filter from whichever bounds were supplied. Previously the
        # end_year bound was dropped whenever start_year was also given.
        conditions = [f'researchers="{researcher_id}"']
        if start_year is not None:
            conditions.append(f"start_year>={start_year}")
        if end_year is not None:
            conditions.append(f"start_year<={end_year}")
        where_clause = " and ".join(conditions)
        query = f"search grants where {where_clause} return grants[basics+extras]"

        if iterative:
            result = self.query_iterative(query, limit=limit)
        else:
            query = f"{query} limit {limit}"
            result = self.query(query)

        return result

    def org_pubs_most_cited(self, org_id, recent=False, iterative=False, limit=100, **kwargs):

        if recent:
            query = f"""search publications where research_orgs.id="{org_id}"
            return publications[basics+recent_citations]
            sort by recent_citations"""
        else:
            query = f"""search publications where research_orgs.id="{org_id}"
            return publications[basics+times_cited]
            sort by times_cited"""

        if iterative:
            result = self.query_iterative(query, limit=limit)
        else:
            query = f"{query} limit {limit}"
            result = self.query(query)

        return result

    def org_pubs_top_areas(self, org_id, iterative=False, limit=1000, return_plot=False, **kwargs):
        """Break the publications of an organization down by research area.

        Args:
            org_id (str): The ID of the organization. For example, grid.411461.7
            iterative (bool, optional): If True, the query will be iterative. Defaults to False.
            limit (int, optional): The number of results to return. Defaults to 1000.
            return_plot (bool, optional): If True, two scatter plots are returned with the dataframe. Defaults to False.
            **kwargs: Accepted but not used by this method.

        Returns:
            pandas.DataFrame | tuple: A dataframe with one row per "category_for" entry of each
                publication (with doi, title and times_cited as metadata) when return_plot is
                False; otherwise (dataframe, research-area figure, journal figure).
        """
        query = f"""search publications where research_orgs.id="{org_id}"
            return publications[doi+title+times_cited+category_for+journal]
            sort by times_cited"""

        if iterative:
            result = self.query_iterative(query, limit=limit)
        else:
            query = f"{query} limit {limit}"
            result = self.query(query)

        # Taken before normalize_key touches result.publications, as in the
        # original statement order.
        raw_df = result.as_dataframe()
        # Passes [] as the default for "category_for"; presumably so publications
        # lacking the key still work as a record_path below (see dimcli docs).
        dimcli.utils.normalize_key("category_for", result.publications, [])
        df = pd.json_normalize(
            result.publications,
            record_path="category_for",
            meta=["doi", "title", "times_cited"],
            errors="ignore",
        )

        if not return_plot:
            return df

        # The organization name is only used in the plot titles, so the extra
        # API request is made only when plots are requested.
        org_name = self.search_org_by_id(org_id).as_dataframe()["name"][0]
        area_fig = px.scatter(
            df,
            x="times_cited",
            y="name",
            marginal_x="histogram",
            marginal_y="histogram",
            hover_data=["doi", "title"],
            height=600,
            title=f"Publications from {org_name} - Research Areas vs. Citations",
        )
        journal_fig = px.scatter(
            raw_df,
            x="times_cited",
            y="journal.title",
            marginal_x="histogram",
            marginal_y="histogram",
            height=600,
            title=f"Publications from {org_name} - Journals vs. Citations",
        )
        return df, area_fig, journal_fig

__init__(self, key=None, **kwargs) special

Initialize the DSL.

Source code in scholarpy/scholarpy.py
def __init__(self, key=None, **kwargs):
    """Initialize the DSL and log in to the Dimensions API.

    Args:
        key (str, optional): The Dimensions API key. When None, the DIM_TOKEN
            environment variable is used instead. Defaults to None.
        **kwargs: Passed on to the parent class initializer.

    Raises:
        ValueError: If no key is given and DIM_TOKEN is not set. The API lab
            page is opened in the web browser before the error is raised.
    """
    if key is None:
        key = os.environ.get("DIM_TOKEN")
        if key is None:
            webbrowser.open("https://api-lab.dimensions.ai")
            raise ValueError(
                "No Dimensions API key can be found. Please go to https://www.dimensions.ai/contact-us to request an API key."
            )

    # Log in with whichever key was resolved. The login call used to sit inside
    # the `key is None` branch, so an explicitly passed key was never used.
    dimcli.login(key=key)

    super().__init__(**kwargs)

h_index(self, id, iterative=False, limit=1000)

Get the h-index of a researcher.

Parameters:

Name Type Description Default
id str

The ID of the researcher. For example, ur.010551261751.12

required
iterative bool

If True, the query will be iterative. Defaults to False.

False
limit int

The number of results to return. Defaults to 1000.

1000
Source code in scholarpy/scholarpy.py
def h_index(self, id, iterative=False, limit=1000):
    """Get the h-index of a researcher.

    Args:
        id (str): The ID of the researcher. For example, ur.010551261751.12
        iterative (bool, optional): If True, the query will be iterative. Defaults to False.
        limit (int, optional): The number of results to return. Defaults to 1000.

    Returns:
        int: The h-index computed from the retrieved publications; 0 when no
            citation data is returned.
    """

    def the_H_function(sorted_citations_list):
        """From a list of citation counts sorted in descending order, return
        the largest 1-based position whose citation count is >= that position.

        eg
        >>> the_H_function([10, 8, 5, 4, 3]) => 4
        >>> the_H_function([25, 8, 5, 3, 3]) => 3
        >>> the_H_function([1000, 20]) => 2
        """
        # Iterative on purpose: the recursive form needed one stack frame per
        # counted publication and copied the list tail at every step.
        h = 0
        for rank, citations in enumerate(sorted_citations_list, start=1):
            if citations >= rank:
                h = rank
            else:
                break
        return h

    def get_pubs_citations(researcher_id, iterative=False, limit=1000):
        """Return the times_cited values of the researcher's publications, in query order."""
        q = f'search publications where researchers.id = "{researcher_id}" return publications[times_cited] sort by times_cited'

        if iterative:
            result = self.query_iterative(q, limit=limit)
        else:
            q = f"{q} limit {limit}"
            result = self.query(q)

        df = result.as_dataframe()
        # A dataframe without the column (e.g. no publications) means no citations.
        if "times_cited" not in df.columns:
            return []
        return list(df.fillna(0)["times_cited"])

    return the_H_function(get_pubs_citations(id, iterative=iterative, limit=limit))

org_grant_funders(self, org_id, start_year=None, end_year=None, iterative=False, limit=20, return_plot=False, **kwargs)

Top funders of an organization.

Parameters:

Name Type Description Default
org_id str

The ID of the organization. For example, grid.411461.7

required
start_year int

The start year of the publication. Defaults to None.

None
end_year int

The end year of the publication. Defaults to None.

None
iterative bool

If True, the query will be iterative. Defaults to False.

False
limit int

The number of results to return. Defaults to 20.

20
return_plot bool

If True, the plot of the results will be returned. Defaults to False.

False

Returns:

Type Description
pandas.DataFrame or tuple

The dataframe of the results; when return_plot is True, a tuple of the dataframe and the plot.

Source code in scholarpy/scholarpy.py
def org_grant_funders(
    self,
    org_id,
    start_year=None,
    end_year=None,
    iterative=False,
    limit=20,
    return_plot=False,
    **kwargs,
):
    """List the funders of an organization's grants, sorted by funding.

    Args:
        org_id (str): The ID of the organization. For example, grid.411461.7
        start_year (int, optional): Lower bound applied to the grants "year" field. Defaults to None.
        end_year (int, optional): Upper bound applied to the grants "year" field. Defaults to None.
        iterative (bool, optional): If True, the query is run through query_iterative(). Defaults to False.
        limit (int, optional): The limit passed to the query. Defaults to 20.
        return_plot (bool, optional): If True, a bar chart is returned together with the dataframe. Defaults to False.
        **kwargs: Accepted but not used by this method.

    Returns:
        pandas.DataFrame | tuple: The result dataframe when return_plot is False; otherwise a
            (dataframe, figure) tuple in which the figure is None if the dataframe is empty.
    """
    # Assemble the where-clause from whichever year bounds were supplied.
    conditions = [f'research_orgs="{org_id}"']
    if start_year is not None:
        conditions.append(f"year>={start_year}")
    if end_year is not None:
        conditions.append(f"year<={end_year}")
    where_clause = " and ".join(conditions)
    query = (
        f"search grants where {where_clause} "
        "return funders aggregate funding sort by funding"
    )

    if iterative:
        result = self.query_iterative(query, limit=limit)
    else:
        result = self.query(f"{query} limit {limit}")

    df = result.as_dataframe()
    if not return_plot:
        return df
    if df.empty:
        return df, None

    # The organization name is only used for the chart title.
    org_name = self.search_org_by_id(org_id).as_dataframe()["name"][0]
    fig = px.bar(
        df, x="name", y="funding", title=f"Funding for {org_name} - by funder"
    )
    return df, fig

org_grants_annual_stats(self, org_id, start_year=None, end_year=None, iterative=False, limit=100, return_plot=False, **kwargs)

Get the annual number of grants and funding amount for an organization.

Parameters:

Name Type Description Default
org_id str

The ID of the organization. For example, grid.411461.7

required
start_year int

The start year of the publication. Defaults to None.

None
end_year int

The end year of the publication. Defaults to None.

None
iterative bool

If True, the query will be iterative. Defaults to False.

False
limit int

The number of results to return. Defaults to 100.

100
return_plot bool

If True, the plot of the results will be returned. Defaults to False.

False

Returns:

Type Description
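pandas.DataFrame or tuple

The dataframe of the results; when return_plot is True, a tuple of the dataframe and the two plots (number of grants and funding amount).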

Source code in scholarpy/scholarpy.py
def org_grants_annual_stats(self, org_id, start_year=None, end_year=None, iterative=False, limit=100, return_plot=False, **kwargs):
    """Get the number of grants and the funding amount per start year for an organization.

    Args:
        org_id (str): The ID of the organization. For example, grid.411461.7
        start_year (int, optional): Only include grants starting in or after this year. Defaults to None.
        end_year (int, optional): Only include grants starting in or before this year. Defaults to None.
        iterative (bool, optional): If True, the query will be iterative. Defaults to False.
        limit (int, optional): The number of results to return. Defaults to 100.
        return_plot (bool, optional): If True, the plots of the results will be returned. Defaults to False.
        **kwargs: Accepted but not used by this method.

    Returns:
        pandas.DataFrame | tuple: The result dataframe (its "id" column renamed to "year") when
            return_plot is False; otherwise (dataframe, count figure, funding figure), where both
            figures are None if the dataframe is empty.
    """
    # Every supplied bound becomes a start_year condition, so end_year is
    # honoured too (it used to be silently dropped from the query).
    conditions = [f'research_orgs="{org_id}"']
    if start_year is not None:
        conditions.append(f"start_year>={start_year}")
    if end_year is not None:
        conditions.append(f"start_year<={end_year}")
    where_clause = " and ".join(conditions)
    query = f"search grants where {where_clause} return start_year aggregate funding"

    if iterative:
        result = self.query_iterative(query, limit=limit)
    else:
        query = f"{query} limit {limit}"
        result = self.query(query)

    df = result.as_dataframe()
    df.rename(columns={"id": "year"}, inplace=True)

    if not return_plot:
        return df
    if df.empty:
        return df, None, None

    # The organization name is only used for the chart titles.
    org_name = self.search_org_by_id(org_id).as_dataframe()["name"][0]
    fig_count = px.bar(df, x="year", y="count",
                       title=f"The number of grants for {org_name} - by year")
    fig_amount = px.bar(df, x="year", y="funding",
                        title=f"The funding amount for {org_name} - by year")
    return df, fig_count, fig_amount

org_pubs_annual_stats(self, org_id, start_year=None, end_year=None, iterative=False, limit=1000, return_plot=False, **kwargs)

Get the annual number of publications for an organization.

Parameters:

Name Type Description Default
org_id str

The ID of the organization. For example, grid.411461.7

required
start_year int

The start year of the publication. Defaults to None.

None
end_year int

The end year of the publication. Defaults to None.

None
iterative bool

If True, the query will be iterative. Defaults to False.

False
limit int

The number of results to return. Defaults to 1000.

1000
return_plot bool

If True, the plot of the results will be returned. Defaults to False.

False

Returns:

Type Description
pandas.DataFrame or tuple

The dataframe of the results; when return_plot is True, a tuple of the dataframe and the plot.

Source code in scholarpy/scholarpy.py
def org_pubs_annual_stats(
    self,
    org_id,
    start_year=None,
    end_year=None,
    iterative=False,
    limit=1000,
    return_plot=False,
    **kwargs,
):
    """Count the publications of an organization per year.

    Args:
        org_id (str): The ID of the organization. For example, grid.411461.7
        start_year (int, optional): Earliest publication year to include. Defaults to None.
        end_year (int, optional): Latest publication year to include. Defaults to None.
        iterative (bool, optional): If True, the query is run through query_iterative(). Defaults to False.
        limit (int, optional): The limit passed to the query. Defaults to 1000.
        return_plot (bool, optional): If True, a bar chart is returned together with the dataframe. Defaults to False.
        **kwargs: Accepted but not used by this method.

    Returns:
        pandas.DataFrame | tuple: The result dataframe (its "id" column renamed to "year") when
            return_plot is False; otherwise a (dataframe, figure) tuple in which the figure is
            None if the dataframe is empty.
    """
    # Assemble the where-clause from whichever year bounds were supplied.
    conditions = [f'research_orgs="{org_id}"']
    if start_year is not None:
        conditions.append(f"year>={start_year}")
    if end_year is not None:
        conditions.append(f"year<={end_year}")
    where_clause = " and ".join(conditions)
    query = f"search publications where {where_clause} return year"

    if iterative:
        result = self.query_iterative(query, limit=limit)
    else:
        result = self.query(f"{query} limit {limit}")

    df = result.as_dataframe()
    df.rename(columns={"id": "year"}, inplace=True)

    if not return_plot:
        return df
    if df.empty:
        return df, None

    # The organization name is only used for the chart title.
    org_name = self.search_org_by_id(org_id).as_dataframe()["name"][0]
    fig = px.bar(
        df, x="year", y="count", title=f"Publications from {org_name} - by year"
    )
    return df, fig

researcher_annual_stats(self, data, geonames_df=None)

Get the annual stats of a researcher, including publications, collaborators, collaborating institutions, and cities.

Parameters:

Name Type Description Default
data dimcli.DslDataset

JSON data of the input. It can be derived from search_pubs_by_researcher_id().

required
geonames_df pd.DataFrame

The geonames dataframe. Defaults to None.

None

Returns:

Type Description
pd.DataFrame

The dataframe of the results.

Source code in scholarpy/scholarpy.py
def researcher_annual_stats(self, data, geonames_df=None):
    """Get the annual stats of a researcher, including publications, collaborators, collaborating institutions, and cities.

    Only the first affiliation of each author record is considered.

    Args:
        data (dimcli.DslDataset): JSON data of the input. It can be derived from search_pubs_by_researcher_id().
        geonames_df (pd.DataFrame, optional): The geonames dataframe passed to geoname_latlon(). Defaults to None.

    Returns:
        pd.DataFrame | tuple: The per-year statistics dataframe when geonames_df is None;
            otherwise a tuple of that dataframe and a per-author dataframe with name, year,
            institution, city, country, latitude and longitude columns.
    """

    def first_affiliation_field(affiliation, field, default):
        """Return `field` of the first affiliation entry, or `default` if it is unavailable."""
        try:
            return affiliation[0][field]
        except (IndexError, KeyError, TypeError):
            # Empty list, missing key, or a value that cannot be indexed this way.
            return default

    pubs = data.as_dataframe()
    years_dict = pubs[["id", "year"]].set_index("id").to_dict()["year"]
    df = data.as_dataframe_authors()
    df["name"] = df["first_name"] + " " + df["last_name"]
    affiliations = df["affiliations"].values.tolist()

    institutions = [first_affiliation_field(a, "name", "") for a in affiliations]
    city_ids = [first_affiliation_field(a, "city_id", 0) for a in affiliations]
    cities = [first_affiliation_field(a, "city", "") for a in affiliations]
    countries = [first_affiliation_field(a, "country", "") for a in affiliations]
    # Each author record takes the year of the publication it belongs to.
    years = [years_dict[pub_id] for pub_id in df["pub_id"].values.tolist()]

    latitudes = []
    longitudes = []
    if geonames_df is not None:
        for city_id in city_ids:
            try:
                latitude, longitude = geoname_latlon(city_id, geonames_df)
            except Exception:
                # Best effort: any failed lookup falls back to (0, 0).
                latitude, longitude = 0, 0
            latitudes.append(latitude)
            longitudes.append(longitude)

    df["year"] = years
    df["institution"] = institutions
    df["city"] = cities
    df["country"] = countries
    df["city_id"] = city_ids

    if geonames_df is not None:
        df["latitude"] = latitudes
        df["longitude"] = longitudes

    # The inner groupby collapses duplicates; the outer one counts the
    # distinct values per year.
    pubs_stats = pubs.groupby("year").size()
    collaborators_stats = (
        df.groupby(["year", "name"]).size().groupby(level=0).size()
    )
    institutions_stats = (
        df.groupby(["year", "institution"]).size().groupby(level=0).size()
    )
    cities_stats = df.groupby(
        ["year", "city_id"]).size().groupby(level=0).size()

    df2 = pd.DataFrame(
        {
            "year": cities_stats.index,
            "pubs": pubs_stats,
            "collaborators": collaborators_stats,
            "institutions": institutions_stats,
            "cities": cities_stats,
        }
    )
    if geonames_df is None:
        return df2
    else:
        return (
            df2,
            df[
                [
                    "name",
                    "year",
                    "institution",
                    "city",
                    "country",
                    "latitude",
                    "longitude",
                ]
            ],
        )

researcher_pubs_authors(self, id, start_year=None, end_year=None, iterative=False, limit=1000)

Get the authors of a researcher's publications.

Parameters:

Name Type Description Default
id str

The ID of the researcher. For example, ur.010551261751.12

required
start_year int

The start year of the publication. Defaults to None.

None
end_year int

The end year of the publication. Defaults to None.

None
iterative bool

If True, the query will be iterative. Defaults to False.

False
limit int

The number of results to return. Defaults to 1000.

1000

Returns:

Type Description
pandas.DataFrame

The dataframe of the results.

Source code in scholarpy/scholarpy.py
def researcher_pubs_authors(
    self, id, start_year=None, end_year=None, iterative=False, limit=1000
):
    """Return a researcher's publications with the number of authors of each.

    Args:
        id (str): The ID of the researcher. For example, ur.010551261751.12
        start_year (int, optional): The start year of the publication. Defaults to None.
        end_year (int, optional): The end year of the publication. Defaults to None.
        iterative (bool, optional): If True, the query will be iterative. Defaults to False.
        limit (int, optional): The number of results to return. Defaults to 1000.

    Returns:
        pandas.DataFrame: The publications dataframe with an added "authors" column
            holding the number of author records per publication.
    """
    dataset = self.search_pubs_by_researcher_id(
        id, start_year, end_year, iterative=iterative, limit=limit
    )
    publications = dataset.as_dataframe()

    # Number of author records per publication id.
    per_pub_counts = dataset.as_dataframe_authors()["pub_id"].value_counts()
    author_counts = pd.DataFrame(
        {"id": per_pub_counts.index, "authors": per_pub_counts.values}
    ).set_index("id")

    return publications.join(author_counts, on="id")

researcher_pubs_stats(self, id, start_year=None, end_year=None, iterative=False, limit=1000, return_plot=False, **kwargs)

Get the publications stats of a researcher.

Parameters:

Name Type Description Default
id str

The ID of the researcher. For example, ur.010551261751.12

required
start_year int

The start year of the publication. Defaults to None.

None
end_year int

The end year of the publication. Defaults to None.

None
iterative bool

If True, the query will be iterative. Defaults to False.

False
limit int

The number of results to return. Defaults to 1000.

1000
return_plot bool

If True, the plot will be returned. Defaults to False.

False

Returns:

Type Description
pandas.DataFrame

The dataframe of the results.

Source code in scholarpy/scholarpy.py
def researcher_pubs_stats(
    self,
    id,
    start_year=None,
    end_year=None,
    iterative=False,
    limit=1000,
    return_plot=False,
    **kwargs,
):
    """Count a researcher's publications per year.

    Args:
        id (str): The ID of the researcher. For example, ur.010551261751.12
        start_year (int, optional): The start year of the publication. Defaults to None.
        end_year (int, optional): The end year of the publication. Defaults to None.
        iterative (bool, optional): If True, the query will be iterative. Defaults to False.
        limit (int, optional): The number of results to return. Defaults to 1000.
        return_plot (bool, optional): If True, a bar plot is returned instead of the dataframe. Defaults to False.
        **kwargs: Passed to the bar plot call when return_plot is True.

    Returns:
        pandas.DataFrame: A dataframe with "year" and "citations" columns, where "citations"
            holds the number of publications found for that year. When return_plot is True,
            the result of the bar plot call is returned instead.
    """
    dataset = self.search_pubs_by_researcher_id(
        id, start_year, end_year, iterative=iterative, limit=limit
    )
    # Publications per year, ordered by year.
    yearly_counts = dataset.as_dataframe()["year"].value_counts().sort_index()

    if return_plot:
        return yearly_counts.plot.bar(**kwargs)

    return pd.DataFrame(
        {"year": yearly_counts.index, "citations": yearly_counts.values}
    )

search_grants_by_org(self, org_id, start_year=None, end_year=None, iterative=False, limit=1000, **kwargs)

Search grants by organization id.

Parameters:

Name Type Description Default
org_id str

The ID of the organization. For example, grid.411461.7

required
start_year int

The start year of the publication. Defaults to None.

None
end_year int

The end year of the publication. Defaults to None.

None
iterative bool

If True, the query will be iterative. Defaults to False.

False
limit int

The number of results to return. Defaults to 1000.

1000

Returns:

Type Description
dimcli.DslDataset

JSON data of the results.

Source code in scholarpy/scholarpy.py
def search_grants_by_org(self, org_id, start_year=None, end_year=None, iterative=False, limit=1000, **kwargs):
    """Search grants by organization id.

    Args:
        org_id (str): The ID of the organization. For example, grid.411461.7
        start_year (int, optional): Only include grants starting in or after this year. Defaults to None.
        end_year (int, optional): Only include grants starting in or before this year. Defaults to None.
        iterative (bool, optional): If True, the query will be iterative. Defaults to False.
        limit (int, optional): The number of results to return. Defaults to 1000.
        **kwargs: Accepted but not used by this method.

    Returns:
        dimcli.DslDataset: JSON data of the results.
    """
    # Build the filter from whichever bounds were supplied. Previously the
    # end_year bound was dropped whenever start_year was also given.
    conditions = [f'research_orgs="{org_id}"']
    if start_year is not None:
        conditions.append(f"start_year>={start_year}")
    if end_year is not None:
        conditions.append(f"start_year<={end_year}")
    where_clause = " and ".join(conditions)
    query = f"search grants where {where_clause} return grants[basics+extras]"

    if iterative:
        result = self.query_iterative(query, limit=limit)
    else:
        query = f"{query} limit {limit}"
        result = self.query(query)

    return result

search_grants_by_researcher(self, researcher_id, start_year=None, end_year=None, iterative=False, limit=1000, **kwargs)

Search grants by researcher id.

Parameters:

Name Type Description Default
researcher_id str

The ID of the researcher. For example, ur.01361677540.55

required
start_year int

The start year of the publication. Defaults to None.

None
end_year int

The end year of the publication. Defaults to None.

None
iterative bool

If True, the query will be iterative. Defaults to False.

False
limit int

The number of results to return. Defaults to 1000.

1000

Returns:

Type Description
dimcli.DslDataset

JSON data of the results.

Source code in scholarpy/scholarpy.py
def search_grants_by_researcher(self, researcher_id, start_year=None, end_year=None, iterative=False, limit=1000, **kwargs):
    """Search grants by researcher id.

    Args:
        researcher_id (str): The ID of the researcher. For example, ur.01361677540.55
        start_year (int, optional): Only return grants whose start year is this year or later. Defaults to None.
        end_year (int, optional): Only return grants whose start year is this year or earlier. Defaults to None.
        iterative (bool, optional): If True, the query will be iterative. Defaults to False.
        limit (int, optional): The number of results to return. Defaults to 1000.

    Returns:
        dimcli.DslDataset: JSON data of the results.
    """
    conditions = [f'researchers="{researcher_id}"']
    if start_year is not None:
        conditions.append(f"start_year>={start_year}")
    if end_year is not None:
        # Applied independently of start_year so that passing both bounds
        # yields a closed range instead of silently ignoring end_year.
        conditions.append(f"start_year<={end_year}")

    where_clause = " and ".join(conditions)
    query = f"search grants where {where_clause} return grants[basics+extras]"

    if iterative:
        result = self.query_iterative(query, limit=limit)
    else:
        # The non-iterative form carries the limit inside the query text.
        query = f"{query} limit {limit}"
        result = self.query(query)

    return result

search_journal_by_id(self, id, fields=None, **kwargs)

Search a journal by ID.

Parameters:

Name Type Description Default
id str

The ID of the journal. For example, jour.1018957

required
fields str

The fields to return. For example, [basics+extras]. Defaults to None.

None

Returns:

Type Description
dimcli.DslDataset

JSON data of the results.

Source code in scholarpy/scholarpy.py
def search_journal_by_id(self, id, fields=None, **kwargs):
    """Look up a single journal (source title) by its ID.

    Args:
        id (str): The ID of the journal. For example, jour.1018957
        fields (str, optional): The fields to return. For example, [basics+extras].
            When None, no field specifier is appended to the query. Defaults to None.

    Returns:
        dimcli.DslDataset: JSON data of the results.
    """
    field_spec = "" if fields is None else fields
    return self.query(
        f'search source_titles where id="{id}" return source_titles{field_spec}'
    )

search_journal_by_title(self, title, exact_match=True, fields=None, iterative=False, limit=1000, **kwargs)

Search a journal by title.

Parameters:

Name Type Description Default
title str

The title of the journal.

required
exact_match bool

If True, the title must be an exact match. Defaults to True.

True
fields str

The fields to return. For example, [basics+extras]. Defaults to None.

None
iterative bool

If True, the query will be iterative. Defaults to False.

False
limit int

The number of results to return. Defaults to 1000.

1000

Returns:

Type Description
dimcli.DslDataset

JSON data of the results.

Source code in scholarpy/scholarpy.py
def search_journal_by_title(
    self,
    title,
    exact_match=True,
    fields=None,
    iterative=False,
    limit=1000,
    **kwargs,
):
    """Search a journal by title.

    Args:
        title (str): The title of the journal.
        exact_match (bool, optional): If True, keep only the first hit whose title equals
            `title` (case-insensitive) and re-query that journal by ID. Defaults to True.
        fields (str, optional): The fields to return. For example, [basics+extras]. Defaults to None.
        iterative (bool, optional): If True, the query will be iterative. Defaults to False.
        limit (int, optional): The number of results to return. Defaults to 1000.

    Returns:
        dimcli.DslDataset: JSON data of the results, or None if the exact-match
            lookup fails (a message is printed in that case).
    """
    field_spec = "" if fields is None else fields
    search_query = f'search source_titles for "{title}" return source_titles{field_spec}'

    if not iterative:
        result = self.query(f"{search_query} limit {limit}")
    else:
        result = self.query_iterative(search_query, limit=limit)

    try:
        if not exact_match:
            return result

        candidates = result.as_dataframe()
        matching = candidates[candidates["title"].str.lower() == title.lower()]
        # Indexing [0] raises when nothing matched; the handler below reports it.
        journal_id = matching["id"].values.tolist()[0]
        return self.query(
            f'search source_titles where id="{journal_id}" return source_titles{field_spec}'
        )
    except Exception:
        print("No journal can be found.")
        return None

search_orcid_by_name(self, name, fields=None, iterative=False, limit=1000, return_list=False, **kwargs)

Search a researcher orcid by name.

Parameters:

Name Type Description Default
name str

The name of the researcher.

required
fields str

The fields to return. For example, [basics+extras]. Defaults to None.

None
iterative bool

If True, the query will be iterative. Defaults to False.

False
limit int

The number of results to return. Defaults to 1000.

1000
return_list bool

If True, the results will be returned as a list. Defaults to False.

False

Returns:

Type Description
pd.DataFrame

A dataframe of the results.

Source code in scholarpy/scholarpy.py
def search_orcid_by_name(
    self,
    name,
    fields=None,
    iterative=False,
    limit=1000,
    return_list=False,
    **kwargs,
):
    """Search a researcher orcid by name.

    Args:
        name (str): The name of the researcher.
        fields (str, optional): The fields to return. For example, [basics+extras]. Defaults to None.
        iterative (bool, optional): If True, the query will be iterative. Defaults to False.
        limit (int, optional): The number of results to return. Defaults to 1000.
        return_list (bool, optional): If True, the results will be returned as a list. Defaults to False.

    Returns:
        pd.DataFrame | list | None: By default, a dataframe of the researchers that
            have an ORCID, sorted by current research organization name. With
            `return_list=True`, a list of "first last | orcid | organization" strings.
            None when the search results contain no "orcid_id" column.
    """

    result = self.search_researcher_by_name(
        name, fields=fields, iterative=iterative, limit=limit, return_list=False
    )
    df = result.as_dataframe()
    if "orcid_id" not in df.columns:
        return None

    # Work on an explicit copy: the assignments below would otherwise target a
    # filtered slice of the original frame (pandas SettingWithCopy hazard).
    df = df[~df["orcid_id"].isnull()].copy()
    # Each remaining "orcid_id" cell is indexed with [0], i.e. the first entry is kept.
    df["orcid_id"] = [orcid_ids[0] for orcid_ids in df["orcid_id"].values.tolist()]
    df["current_research_org_name"] = df["current_research_org.name"]
    df.sort_values("current_research_org_name", inplace=True)

    if not return_list:
        return df

    df["uid"] = (
        df["first_name"]
        + " "
        + df["last_name"]
        + " | "
        + df["orcid_id"]
        + " | "
        + df["current_research_org.name"]
    )
    # A missing name or organization makes the concatenated uid null; skip those rows.
    df = df[~df["uid"].isnull()]
    return df["uid"].tolist()

search_org_by_id(self, id, fields=None, **kwargs)

Search an organization by ID. For example, grid.411461.7

Parameters:

Name Type Description Default
id str

The ID of the organization. For example, grid.411461.7

required
fields str

The fields to return. For example, [basics+extras]. Defaults to None.

None

Returns:

Type Description
dimcli.DslDataset

JSON data of the results.

Source code in scholarpy/scholarpy.py
def search_org_by_id(self, id, fields=None, **kwargs):
    """Look up a single organization by its ID.

    Args:
        id (str): The ID of the organization. For example, grid.411461.7
        fields (str, optional): The fields to return. For example, [basics+extras].
            When None, no field specifier is appended to the query. Defaults to None.

    Returns:
        dimcli.DslDataset: JSON data of the results.
    """
    field_spec = fields if fields is not None else ""
    return self.query(
        f'search organizations where id="{id}" return organizations{field_spec}'
    )

search_org_by_name(self, name, exact_match=True, fields=None, iterative=False, limit=1000, return_list=False, **kwargs)

Search an organization by name.

Parameters:

Name Type Description Default
name str

The name of the organization.

required
exact_match bool

If True, the name must be an exact match. Defaults to True.

True
fields str

The fields to return. For example, [basics+extras]. Defaults to None.

None
iterative bool

If True, the query will be iterative. Defaults to False.

False
limit int

The number of results to return. Defaults to 1000.

1000
return_list bool

If True, the results will be returned as a list. Defaults to False.

False

Returns:

Type Description
dimcli.DslDataset

JSON data of the results.

Source code in scholarpy/scholarpy.py
def search_org_by_name(
    self, name, exact_match=True, fields=None, iterative=False, limit=1000, return_list=False, **kwargs
):
    """Search an organization by name.

    Args:
        name (str): The name of the organization.
        exact_match (bool, optional): If True, keep only the first hit whose name equals
            `name` (case-insensitive) and re-query that organization by ID. Defaults to True.
        fields (str, optional): The fields to return. For example, [basics+extras]. Defaults to None.
        iterative (bool, optional): If True, the query will be iterative. Defaults to False.
        limit (int, optional): The number of results to return. Defaults to 1000.
        return_list (bool, optional): If True, return a name-sorted list of
            "id | name" strings instead of the dataset. Defaults to False.

    Returns:
        dimcli.DslDataset: JSON data of the results. A list when `return_list` is True;
            None when there is nothing to list or the lookup fails (a message is
            printed on failure).
    """
    field_spec = "" if fields is None else fields
    # The name is wrapped in escaped quotes inside the quoted search string.
    search_query = f'search organizations for "\\"{name}\\"" return organizations{field_spec}'

    if not iterative:
        result = self.query(f"{search_query} limit {limit}")
    else:
        result = self.query_iterative(search_query, limit=limit)

    try:
        if exact_match:
            candidates = result.as_dataframe()
            matching = candidates[candidates["name"].str.lower() == name.lower()]
            # Indexing [0] raises when nothing matched; the handler below reports it.
            org_id = matching["id"].values.tolist()[0]
            result = self.query(
                f'search organizations where id="{org_id}" return organizations{field_spec}'
            )

        if not return_list:
            return result

        listing = result.as_dataframe()
        if listing.empty:
            return None

        listing.sort_values("name", inplace=True)
        listing["name_id"] = listing["id"] + " | " + listing["name"]
        return listing["name_id"].values.tolist()

    except Exception:
        print("No organization can be found.")
        return None

search_pubs_by_journal_id(self, id, start_year=None, end_year=None, fields=None, iterative=False, limit=1000, **kwargs)

Search publications by journal ID.

Parameters:

Name Type Description Default
id str

The ID of the journal. For example, jour.1018957

required
start_year int

The start year of the publication. Defaults to None.

None
end_year int

The end year of the publication. Defaults to None.

None
fields str

The fields to return. For example, [basics+extras]. Defaults to None.

None
iterative bool

If True, the query will be iterative. Defaults to False.

False
limit int

The number of results to return. Defaults to 1000.

1000

Returns:

Type Description
dimcli.DslDataset

JSON data of the results.

Source code in scholarpy/scholarpy.py
def search_pubs_by_journal_id(
    self,
    id,
    start_year=None,
    end_year=None,
    fields=None,
    iterative=False,
    limit=1000,
    **kwargs,
):
    """Search publications by journal ID, optionally bounded by publication year.

    Args:
        id (str): The ID of the journal. For example, jour.1018957
        start_year (int, optional): The start year of the publication (inclusive). Defaults to None.
        end_year (int, optional): The end year of the publication (inclusive). Defaults to None.
        fields (str, optional): The fields to return. For example, [basics+extras]. Defaults to None.
        iterative (bool, optional): If True, the query will be iterative. Defaults to False.
        limit (int, optional): The number of results to return. Defaults to 1000.

    Returns:
        dimcli.DslDataset: JSON data of the results.
    """
    field_spec = "" if fields is None else fields

    # Collect the filters in order, then join them into one where-clause.
    filters = [f'journal.id="{id}"']
    if start_year is not None:
        filters.append(f"year>={start_year}")
    if end_year is not None:
        filters.append(f"year<={end_year}")
    where_clause = " and ".join(filters)

    query = f"search publications where {where_clause} return publications{field_spec}"

    if not iterative:
        return self.query(f"{query} limit {limit}")
    return self.query_iterative(query, limit=limit)

search_pubs_by_keyword(self, keyword, exact_match=True, scope='title_abstract_only', start_year=None, end_year=None, journal_id=None, fields=None, sorted_field='times_cited', iterative=False, limit=1000, **kwargs)

Search publications by keyword.

Parameters:

Name Type Description Default
keyword str

The keyword to search.

required
exact_match bool

If True, the keyword will be matched exactly. Defaults to True.

True
scope str

The scope of the search. Defaults to "title_abstract_only".

'title_abstract_only'
start_year int

The start year of the publication. Defaults to None.

None
end_year int

The end year of the publication. Defaults to None.

None
fields str

The fields to return. For example, [basics+extras]. Defaults to None.

None
sorted_field str

The field to sort by. Defaults to "times_cited".

'times_cited'
iterative bool

If True, the query will be iterative. Defaults to False.

False
limit int

The number of results to return. Defaults to 1000.

1000

Exceptions:

Type Description
ValueError

Raised when scope is not one of the supported search scopes.

Returns:

Type Description
dimcli.DslDataset

JSON data of the results.

Source code in scholarpy/scholarpy.py
def search_pubs_by_keyword(
    self,
    keyword,
    exact_match=True,
    scope="title_abstract_only",
    start_year=None,
    end_year=None,
    journal_id=None,
    fields=None,
    sorted_field="times_cited",
    iterative=False,
    limit=1000,
    **kwargs,
):
    """Search publications by keyword.

    Args:
        keyword (str): The keyword to search.
        exact_match (bool, optional): If True, the keyword is wrapped in escaped quotes
            so it is searched as a phrase. Defaults to True.
        scope (str, optional): The scope of the search. Defaults to "title_abstract_only".
        start_year (int, optional): The start year of the publication (inclusive). Defaults to None.
        end_year (int, optional): The end year of the publication (inclusive). Defaults to None.
        journal_id (str, optional): Restrict the search to this journal ID. Defaults to None.
        fields (str, optional): The fields to return. For example, [basics+extras]. Defaults to None.
        sorted_field (str, optional): The field to sort by. Defaults to "times_cited".
        iterative (bool, optional): If True, the query will be iterative. Defaults to False.
        limit (int, optional): The number of results to return. Defaults to 1000.

    Raises:
        ValueError: If `scope` is not one of the allowed scopes.

    Returns:
        dimcli.DslDataset: JSON data of the results.
    """
    allowed_scopes = [
        "authors",
        "concepts",
        "full_data",
        "full_data_exact",
        "title_abstract_only",
        "title_only",
    ]
    if scope not in allowed_scopes:
        raise ValueError(f"scope must be one of {allowed_scopes}")

    quote = '\\"' if exact_match else ""

    if fields is None:
        fields = "[basics+altmetric+times_cited+field_citation_ratio+authors_count+doi+dimensions_url]"

    year_terms = []
    if start_year is not None:
        year_terms.append(f"year>={start_year}")
    if end_year is not None:
        year_terms.append(f"year<={end_year}")

    if year_terms:
        # With a year filter present, the journal filter is chained with "and".
        journal_term = f'and journal.id="{journal_id}"' if journal_id is not None else ""
        where_clause = "where " + " and ".join(year_terms) + " " + journal_term
    else:
        # Without a year filter, the journal filter (if any) opens the where-clause.
        where_clause = f'where journal.id="{journal_id}"' if journal_id is not None else ""

    query = (
        f'search publications in {scope} for "{quote}{keyword}{quote}" '
        f"{where_clause} return publications{fields} sort by {sorted_field}"
    )

    if not iterative:
        return self.query(f"{query} limit {limit}")
    return self.query_iterative(query, limit=limit)

search_pubs_by_org_id(self, id, start_year=None, end_year=None, fields=None, iterative=False, limit=1000, **kwargs)

Search publications by organization ID.

Parameters:

Name Type Description Default
id str

The ID of the organization. For example, grid.411461.7

required
start_year int

The start year of the publication. Defaults to None.

None
end_year int

The end year of the publication. Defaults to None.

None
fields str

The fields to return. For example, [basics+extras]. Defaults to None.

None
iterative bool

If True, the query will be iterative. Defaults to False.

False
limit int

The number of results to return. Defaults to 1000.

1000

Returns:

Type Description
dimcli.DslDataset

JSON data of the results.

Source code in scholarpy/scholarpy.py
def search_pubs_by_org_id(
    self,
    id,
    start_year=None,
    end_year=None,
    fields=None,
    iterative=False,
    limit=1000,
    **kwargs,
):
    """Search publications by organization ID, optionally bounded by publication year.

    Args:
        id (str): The ID of the organization. For example, grid.411461.7
        start_year (int, optional): The start year of the publication (inclusive). Defaults to None.
        end_year (int, optional): The end year of the publication (inclusive). Defaults to None.
        fields (str, optional): The fields to return. For example, [basics+extras]. Defaults to None.
        iterative (bool, optional): If True, the query will be iterative. Defaults to False.
        limit (int, optional): The number of results to return. Defaults to 1000.

    Returns:
        dimcli.DslDataset: JSON data of the results.
    """
    field_spec = fields if fields is not None else ""

    # Optional year bounds are appended to the mandatory organization filter.
    year_bounds = ""
    if start_year is not None:
        year_bounds += f" and year>={start_year}"
    if end_year is not None:
        year_bounds += f" and year<={end_year}"

    query = (
        f'search publications where research_orgs="{id}"{year_bounds}'
        f" return publications{field_spec}"
    )

    if iterative:
        return self.query_iterative(query, limit=limit)
    return self.query(f"{query} limit {limit}")

search_pubs_by_researcher_id(self, id, start_year=None, end_year=None, fields=None, iterative=False, limit=1000, **kwargs)

Search publications by researcher ID.

Parameters:

Name Type Description Default
id str

The ID of the researcher. For example, ur.010551261751.12

required
start_year int

The start year of the publication. Defaults to None.

None
end_year int

The end year of the publication. Defaults to None.

None
fields str

The fields to return. For example, [basics+extras]. Defaults to None.

None
iterative bool

If True, the query will be iterative. Defaults to False.

False
limit int

The number of results to return. Defaults to 1000.

1000

Returns:

Type Description
dimcli.DslDataset

JSON data of the results.

Source code in scholarpy/scholarpy.py
def search_pubs_by_researcher_id(
    self,
    id,
    start_year=None,
    end_year=None,
    fields=None,
    iterative=False,
    limit=1000,
    **kwargs,
):
    """Search publications by researcher ID, optionally bounded by publication year.

    Args:
        id (str): The ID of the researcher. For example, ur.010551261751.12
        start_year (int, optional): The start year of the publication (inclusive). Defaults to None.
        end_year (int, optional): The end year of the publication (inclusive). Defaults to None.
        fields (str, optional): The fields to return. When None,
            "[basics+authors_count+times_cited+dimensions_url]" is used. Defaults to None.
        iterative (bool, optional): If True, the query will be iterative. Defaults to False.
        limit (int, optional): The number of results to return. Defaults to 1000.

    Returns:
        dimcli.DslDataset: JSON data of the results.
    """
    field_spec = (
        "[basics+authors_count+times_cited+dimensions_url]" if fields is None else fields
    )

    # The researcher filter always applies; year bounds are added when given.
    filters = [f'researchers.id="{id}"']
    for operator, year in ((">=", start_year), ("<=", end_year)):
        if year is not None:
            filters.append(f"year{operator}{year}")

    query = "search publications where " + " and ".join(filters) + f" return publications{field_spec}"

    if iterative:
        outcome = self.query_iterative(query, limit=limit)
    else:
        outcome = self.query(f"{query} limit {limit}")
    return outcome

search_researcher_by_id(self, id, fields=None, iterative=False, limit=1000, return_df=False, **kwargs)

Search a researcher by ID.

Parameters:

Name Type Description Default
id str

The ID of the researcher. For example, ur.010551261751.12

required
fields str

The fields to return. For example, [basics+extras]. Defaults to None.

None
iterative bool

If True, the query will be iterative. Defaults to False.

False
limit int

The number of results to return. Defaults to 1000.

1000
return_df bool

If True, the results will be returned as a dataframe. Defaults to False.

False

Returns:

Type Description
dimcli.DslDataset

JSON data of the results.

Source code in scholarpy/scholarpy.py
def search_researcher_by_id(
    self,
    id,
    fields=None,
    iterative=False,
    limit=1000,
    return_df=False,
    **kwargs,
):
    """Search a researcher by a Dimensions researcher ID.

    Args:
        id (str): The ID of the researcher. For example, ur.010551261751.12
        fields (str, optional): The fields to return. When None, "[basics+extras]" is used. Defaults to None.
        iterative (bool, optional): If True, the query will be iterative. Defaults to False.
        limit (int, optional): The number of results to return. Defaults to 1000.
        return_df (bool, optional): If True, the results will be returned as a dataframe. Defaults to False.
        **kwargs: Forwarded to `query_iterative` when `iterative` is True.

    Returns:
        dimcli.DslDataset: JSON data of the results. When `return_df` is True, a
            pd.DataFrame with an "index" column (the field names) and a "value"
            column (for each field, the list of values across the returned records).
    """
    if fields is None:
        fields = "[basics+extras]"

    query = f'search researchers where id="{id}" return researchers{fields}'

    if iterative:
        result = self.query_iterative(query, limit=limit, **kwargs)
    else:
        result = self.query(f"{query} limit {limit}")

    if not return_df:
        return result

    # Transpose so each field becomes a row: field name -> list of its values.
    transposed = result.as_dataframe().transpose()
    return pd.DataFrame(
        {"index": transposed.index.tolist(), "value": transposed.values.tolist()}
    )

search_researcher_by_name(self, name, fields=None, iterative=False, limit=1000, return_list=False, **kwargs)

Search a researcher by name.

Parameters:

Name Type Description Default
name str

The name of the researcher.

required
fields str

The fields to return. For example, [basics+extras]. Defaults to None.

None
iterative bool

If True, the query will be iterative. Defaults to False.

False
limit int

The number of results to return. Defaults to 1000.

1000
return_list bool

If True, the results will be returned as a list. Defaults to False.

False

Returns:

Type Description
dimcli.DslDataset

JSON data of the results.

Source code in scholarpy/scholarpy.py
def search_researcher_by_name(
    self,
    name,
    fields=None,
    iterative=False,
    limit=1000,
    return_list=False,
    **kwargs,
):
    """Search a researcher by name.

    The query only matches researchers with `obsolete=0` and `total_publications>0`.

    Args:
        name (str): The name of the researcher.
        fields (str, optional): The fields to return. When None, "[basics+extras]" is used. Defaults to None.
        iterative (bool, optional): If True, the query will be iterative. Defaults to False.
        limit (int, optional): The number of results to return. Defaults to 1000.
        return_list (bool, optional): If True, also return a list of display strings. Defaults to False.

    Returns:
        dimcli.DslDataset: JSON data of the results. When `return_list` is True, a
            tuple `(result, items)` instead, where `items` is a list of
            "first last | id | organization" strings sorted by organization name,
            or None when the search returned no rows.
    """
    field_spec = "[basics+extras]" if fields is None else fields

    # The name is wrapped in escaped quotes inside the quoted search string.
    query = (
        f'search researchers for "\\"{name}\\"" where obsolete=0 and total_publications>0 '
        f"return researchers{field_spec}"
    )
    if not iterative:
        result = self.query(f"{query} limit {limit}")
    else:
        result = self.query_iterative(query, limit=limit)

    if not return_list:
        return result

    df = result.as_dataframe()
    if df.empty:
        return result, None

    # Copy the dotted column under an identifier-safe name so that
    # itertuples() exposes it as a regular attribute.
    df["current_research_org_name"] = df["current_research_org.name"]
    df.sort_values("current_research_org_name", inplace=True)

    items = [
        " | ".join(
            (
                str(row.first_name) + " " + str(row.last_name),
                str(row.id),
                str(row.current_research_org_name),
            )
        )
        for row in df.itertuples()
    ]
    return result, items

search_researcher_collaborators(self, id, pubs=None)

Search collaborators of a researcher.

Parameters:

Name Type Description Default
id str

The ID of the researcher. For example, ur.010551261751.12

required
pubs dimcli.DslDataset

The publications of the researcher. Defaults to None.

None

Returns:

Type Description
pd.DataFrame

A dataframe of the collaborators.

Source code in scholarpy/scholarpy.py
def search_researcher_collaborators(self, id, pubs=None):
    """Search collaborators of a researcher.

    Args:
        id (str): The ID of the researcher. For example, ur.010551261751.12
        pubs (dimcli.DslDataset, optional): The publications of the researcher. When None,
            they are fetched with `search_pubs_by_researcher_id(id)`. Defaults to None.

    Returns:
        pd.DataFrame: One row per collaborator with the columns "Name",
            "Count" (number of author rows with that name) and "Institution"
            (name of the first listed affiliation, or "" when unavailable).
    """
    if pubs is None:
        pubs = self.search_pubs_by_researcher_id(id)

    authors = pubs.as_dataframe_authors()
    # Drop the researcher's own rows; copy so the new column is added to an
    # independent frame rather than to a filtered slice.
    authors = authors[authors["researcher_id"] != id].copy()
    # Display name: first token of the first name plus the last name.
    authors["Name"] = authors["first_name"].str.split(" ").str[0] + " " + authors["last_name"]

    names = authors.drop_duplicates("Name").copy()
    institutions = []
    for affiliation_list in names["affiliations"].values.tolist():
        try:
            institutions.append(affiliation_list[0]["name"])
        except (IndexError, KeyError, TypeError):
            # Empty, malformed or missing affiliation data: leave it blank.
            institutions.append("")
    names["Institution"] = institutions
    names = names[["Name", "Institution"]]

    # Build the count table from the Series directly. The column naming of
    # pd.DataFrame(series.value_counts()) differs between pandas versions
    # (pandas >= 2.0 names it "count"), so do not look it up by name.
    counts = authors["Name"].value_counts()
    result = pd.DataFrame({"Name": counts.index, "Count": counts.values})
    return result.merge(names, on="Name")

annual_stats_barplot(df, columns=None, **kwargs)

Get a barplot of the annual stats.

Parameters:

Name Type Description Default
df pd.DataFrame

The dataframe of the results. Can be derived from df, _ = dsl.researcher_annual_stats().

required
columns list

The columns to plot. Defaults to None.

None

Returns:

Type Description
px.Bar

The barplot.

Source code in scholarpy/scholarpy.py
def annual_stats_barplot(df, columns=None, **kwargs):
    """Build a grouped bar chart of annual statistics, one bar group per year.

    Args:
        df (pd.DataFrame): The dataframe of the results. Can be derived from df, _ = dsl.researcher_annual_stats().
        columns (list, optional): The columns to plot. When None,
            ["pubs", "collaborators", "institutions", "cities"] is used. Defaults to None.
        **kwargs: Accepted but not used by this function.

    Returns:
        The figure returned by `px.bar`.
    """
    y_columns = ["pubs", "collaborators", "institutions", "cities"] if columns is None else columns
    return px.bar(df, x="year", y=y_columns, barmode="group")

collaborator_locations(df)

Get the locations of collaborators.

Parameters:

Name Type Description Default
df pd.DataFrame

The dataframe of collaborators, can be derived from _, df = dsl.researcher_annual_stats().

required

Returns:

Type Description
pd.DataFrame

The dataframe of the results.

Source code in scholarpy/scholarpy.py
def collaborator_locations(df):
    """Get the locations of collaborators.

    Removes the "name", "year" and "index" columns when present, drops duplicate
    rows, discards rows whose latitude is 0 and renumbers the index from 0.
    The input dataframe is left unmodified.

    Args:
        df (pd.DataFrame): The dataframe of collaborators, can be derived from _, df = dsl.researcher_annual_stats().
            It must contain a "latitude" column.

    Returns:
        pd.DataFrame: The dataframe of the results.
    """
    # errors="ignore" skips whichever of these columns is absent; drop() returns
    # a new frame, so the caller's dataframe is not mutated.
    locations = df.drop(columns=["name", "year", "index"], errors="ignore")
    locations = locations.drop_duplicates()
    # geoname_latlon reports unknown cities as latitude 0; presumably this is
    # what the filter removes — TODO confirm against the caller.
    locations = locations[locations["latitude"] != 0]
    # drop=True renumbers the rows without materializing the old index as a
    # column, which also works when the index carries a name.
    return locations.reset_index(drop=True)

geoname_latlon(id, df=None)

Get the latitude and longitude of a city based on a GeoNames city id.

Parameters:

Name Type Description Default
id str

The GeoName city id.

required
df pd.DataFrame

The geonames dataframe. Defaults to None.

None

Returns:

Type Description
tuple

The latitude and longitude of the city.

Source code in scholarpy/scholarpy.py
def geoname_latlon(id, df=None):
    """Get the latitude and longitude of a city based on a GeoNames city id.

    Args:
        id (int | str): The GeoNames city id. Non-integer values are converted with `int()`.
        df (pd.DataFrame, optional): The geonames dataframe with "geonameid", "latitude" and
            "longitude" columns. When None, it is loaded with `get_geonames()`. Defaults to None.

    Raises:
        ValueError: If `id` cannot be converted to an integer.

    Returns:
        tuple: The latitude and longitude of the city, or (0, 0) when the id is not found.
    """
    if not isinstance(id, int):
        try:
            id = int(id)
        except (TypeError, ValueError, OverflowError) as err:
            # Catch only conversion failures and keep the original cause attached.
            raise ValueError("id must be an integer") from err

    if df is None:
        df = get_geonames()

    matches = df[df["geonameid"] == id]
    if matches.empty:
        # (0, 0) is the "not found" value.
        return 0, 0

    first = matches.iloc[0]
    return first["latitude"], first["longitude"]

get_geonames(**kwargs)

Get the geonames dataframe.

Returns:

Type Description
pd.DataFrame

The dataframe of the results.

Source code in scholarpy/scholarpy.py
def get_geonames(**kwargs):
    """Download the cities table and return a subset of its columns.

    Args:
        **kwargs: If `columns` is given as a list, those columns are returned;
            otherwise the default subset ("geonameid", "name", "country_code",
            "population", "latitude", "longitude") is used.

    Returns:
        pd.DataFrame: The dataframe of the results.
    """
    url = "https://raw.githubusercontent.com/giswqs/data/main/world/cities5000.csv"
    # Column names as listed by the original author (not verified here):
    # geonameid, name, asciiname, alternatenames, latitude, longitude,
    # feature_class, feature_code, country_code, cc2, admin1_code, admin2_code,
    # admin3_code, admin4_code, population, elevation, dem, timezone,
    # modification_date
    # The file is read as tab-separated despite its .csv extension.
    cities = pd.read_csv(url, sep="\t", encoding="utf-8")

    selected = kwargs.get("columns")
    if not isinstance(selected, list):
        selected = [
            "geonameid",
            "name",
            "country_code",
            "population",
            "latitude",
            "longitude",
        ]
    return cities[selected]

json_to_df(json_data, transpose=False)

Convert a json file to a dataframe.

Parameters:

Name Type Description Default
json_data json

The json data.

required
transpose bool

If True, transpose the dataframe. Defaults to False.

False

Returns:

Type Description
pd.DataFrame

The dataframe of the results.

Source code in scholarpy/scholarpy.py
def json_to_df(json_data, transpose=False):
    """Convert query results to a flat dataframe via a CSV round trip.

    The dataframe is serialized to CSV and parsed back, so nested values (lists,
    dicts) come back as their string representation and column dtypes are
    re-inferred by `pd.read_csv`. The round trip goes through an in-memory
    buffer, so no temporary file is created or left behind on failure.

    Args:
        json_data: The result object; it must provide an `as_dataframe()` method.
        transpose (bool, optional): If True, transpose the dataframe. The former
            column names are then written as the first CSV column. Defaults to False.

    Returns:
        pd.DataFrame: The dataframe of the results, or None if the data is empty.
    """
    import io

    df = json_data.as_dataframe()
    if df.empty:
        return None

    if transpose:
        df = df.transpose()

    buffer = io.StringIO()
    # The index is written only when transposed: it then holds the field names.
    df.to_csv(buffer, index=transpose)
    buffer.seek(0)
    return pd.read_csv(buffer)

Last update: 2022-02-09