scholarpy module¶

Main module.

`Dsl (Dsl)` ¶

Source code in scholarpy/scholarpy.py

class Dsl(dimcli.Dsl):
    def __init__(self, key=None, **kwargs):
        """Initialize the DSL client and log in to the Dimensions API.

        Args:
            key (str, optional): The Dimensions API key. When omitted, the key is
                read from the DIM_TOKEN environment variable. Defaults to None.
            **kwargs: Keyword arguments forwarded to the parent ``dimcli.Dsl``
                initializer.

        Raises:
            ValueError: If no key is passed and DIM_TOKEN is not set.
        """
        if key is None:
            key = os.environ.get("DIM_TOKEN")
            if key is None:
                # Open the API lab page for the user before failing.
                webbrowser.open("https://api-lab.dimensions.ai")
                raise ValueError(
                    "No Dimensions API key can be found. Please go to https://www.dimensions.ai/contact-us to request an API key."
                )

        # Log in with whichever key was resolved. Previously the login call was
        # nested under the "key is None" branch, so an explicitly passed key
        # was silently ignored.
        dimcli.login(key=key)

        super().__init__(**kwargs)

    def search_researcher_by_id(
        self,
        id,
        fields=None,
        iterative=False,
        limit=1000,
        return_df=False,
        **kwargs,
    ):
        """Search a researcher by a Dimensions researcher ID.

        Args:
            id (str): The ID of the researcher. For example, ur.010551261751.12
            fields (str, optional): The fields to return. For example, [basics+extras]. Defaults to None,
                in which case [basics+extras] is used.
            iterative (bool, optional): If True, the query is run with query_iterative(). Defaults to False.
            limit (int, optional): The number of results to return. Defaults to 1000.
            return_df (bool, optional): If True, the results will be returned as a dataframe. Defaults to False.
            **kwargs: Extra keyword arguments forwarded to query_iterative() when iterative is True.

        Returns:
            dimcli.DslDataset | pd.DataFrame: The query result, or, when return_df is True,
                a two-column dataframe ("index", "value") built from the transposed result.
        """
        if fields is None:
            fields = "[basics+extras]"

        query = f'search researchers where id="{id}" return researchers{fields}'

        if iterative:
            result = self.query_iterative(query, limit=limit, **kwargs)
        else:
            result = self.query(f"{query} limit {limit}")

        if not return_df:
            return result

        # One row per field: the field name in "index" and the list of that
        # field's values (one per returned record) in "value".
        transposed = result.as_dataframe().transpose()
        return pd.DataFrame(
            {
                "index": transposed.index.tolist(),
                "value": transposed.values.tolist(),
            }
        )

    def search_researcher_by_name(
        self,
        name,
        fields=None,
        iterative=False,
        limit=1000,
        return_list=False,
        **kwargs,
    ):
        """Search researchers by name.

        The query is restricted to non-obsolete researchers that have at least
        one publication.

        Args:
            name (str): The name of the researcher.
            fields (str, optional): The fields to return. For example, [basics+extras]. Defaults to None,
                in which case [basics+extras] is used.
            iterative (bool, optional): If True, the query is run with query_iterative(). Defaults to False.
            limit (int, optional): The number of results to return. Defaults to 1000.
            return_list (bool, optional): If True, a list of display labels is returned alongside
                the result. Defaults to False.
            **kwargs: Accepted but not used by this method.

        Returns:
            dimcli.DslDataset | tuple: The query result. When return_list is True, a tuple
                (result, labels) where labels is a list of "first last | id | organization"
                strings sorted by organization name, or None when nothing was found.
        """
        returned_fields = "[basics+extras]" if fields is None else fields
        base_query = f'search researchers for "\\"{name}\\"" where obsolete=0 and total_publications>0 return researchers{returned_fields}'

        if iterative:
            result = self.query_iterative(base_query, limit=limit)
        else:
            result = self.query(f"{base_query} limit {limit}")

        if not return_list:
            return result

        frame = result.as_dataframe()
        if frame.empty:
            return result, None

        # Flatten the dotted column name so the rows can be ordered by it.
        frame["current_research_org_name"] = frame["current_research_org.name"]
        frame.sort_values("current_research_org_name", inplace=True)

        labels = []
        for first, last, researcher_id, org in zip(
            frame["first_name"],
            frame["last_name"],
            frame["id"],
            frame["current_research_org_name"],
        ):
            full_name = str(first) + " " + str(last)
            labels.append(" | ".join([full_name, str(researcher_id), str(org)]))

        return result, labels

    def search_researcher_collaborators(self, id, pubs=None):
        """Search collaborators of a researcher.

        Args:
            id (str): The ID of the researcher. For example, ur.010551261751.12
            pubs (dimcli.DslDataset, optional): The publications of the researcher. When None,
                they are fetched with search_pubs_by_researcher_id(). Defaults to None.

        Returns:
            pd.DataFrame: A dataframe of the collaborators with the columns
                "Name", "Count" (number of author rows carrying that name) and "Institution".
        """
        if pubs is None:
            pubs = self.search_pubs_by_researcher_id(id)

        # Copy the filtered rows so the new column is written to an independent
        # frame rather than to a slice of the authors dataframe.
        authors = pubs.as_dataframe_authors()
        authors = authors[authors["researcher_id"] != id].copy()
        # Keep only the first token of the first name so name variants such as
        # "John A." and "John" are counted together.
        authors["Name"] = (
            authors["first_name"].str.split(" ").str[0] + " " + authors["last_name"]
        )

        names = authors.drop_duplicates("Name").copy()
        institutions = []
        for affiliation in names["affiliations"].values.tolist():
            try:
                institutions.append(affiliation[0]["name"])
            except (IndexError, KeyError, TypeError):
                # Missing, empty or malformed affiliation data.
                institutions.append("")
        names["Institution"] = institutions
        names = names[["Name", "Institution"]]

        # Build the count table from the Series directly: the name pandas gives
        # the value_counts() result differs between versions ("Name" before
        # 2.0, "count" from 2.0 on), so looking the column up by name breaks.
        counts = authors["Name"].value_counts()
        result = pd.DataFrame({"Name": counts.index, "Count": counts.values})
        return result.merge(names, on="Name")

    def search_orcid_by_name(
        self,
        name,
        fields=None,
        iterative=False,
        limit=1000,
        return_list=False,
        **kwargs,
    ):
        """Search researchers' ORCID iDs by name.

        Args:
            name (str): The name of the researcher.
            fields (str, optional): The fields to return. For example, [basics+extras]. Defaults to None.
            iterative (bool, optional): If True, the query will be iterative. Defaults to False.
            limit (int, optional): The number of results to return. Defaults to 1000.
            return_list (bool, optional): If True, the results will be returned as a list. Defaults to False.
            **kwargs: Accepted but not used by this method.

        Returns:
            pd.DataFrame | list | None: A dataframe of the researchers that have an ORCID iD,
                sorted by organization name; a list of "first last | orcid | organization"
                strings when return_list is True; or None when the result has no
                "orcid_id" column.
        """
        result = self.search_researcher_by_name(
            name, fields=fields, iterative=iterative, limit=limit, return_list=False
        )
        df = result.as_dataframe()
        if "orcid_id" not in df.columns:
            return None

        # Copy the filtered rows so the column assignments below operate on an
        # independent frame instead of a slice of the original dataframe.
        df = df[df["orcid_id"].notnull()].copy()
        # Each cell is indexed with [0] to keep its first entry only.
        df["orcid_id"] = [orcids[0] for orcids in df["orcid_id"].values.tolist()]
        df["current_research_org_name"] = df["current_research_org.name"]
        df.sort_values("current_research_org_name", inplace=True)

        if not return_list:
            return df

        df["uid"] = (
            df["first_name"]
            + " "
            + df["last_name"]
            + " | "
            + df["orcid_id"]
            + " | "
            + df["current_research_org.name"]
        )
        # A missing component makes the concatenated label null; drop those.
        df = df[df["uid"].notnull()]
        return df["uid"].tolist()

    def search_journal_by_id(self, id, fields=None, **kwargs):
        """Look up a journal (source title) by its ID.

        Args:
            id (str): The ID of the journal. For example, jour.1018957
            fields (str, optional): The fields to return. For example, [basics+extras].
                Defaults to None, in which case no field list is added to the query.
            **kwargs: Accepted but not used by this method.

        Returns:
            dimcli.DslDataset: JSON data of the results.
        """
        returned_fields = fields if fields is not None else ""
        return self.query(
            f'search source_titles where id="{id}" return source_titles{returned_fields}'
        )

    def search_journal_by_title(
        self,
        title,
        exact_match=True,
        fields=None,
        iterative=False,
        limit=1000,
        **kwargs,
    ):
        """Search journals (source titles) by title.

        Args:
            title (str): The title of the journal.
            exact_match (bool, optional): If True, only the journal whose title equals
                ``title`` (case-insensitively) is returned. Defaults to True.
            fields (str, optional): The fields to return. For example, [basics+extras]. Defaults to None.
            iterative (bool, optional): If True, the query is run with query_iterative(). Defaults to False.
            limit (int, optional): The number of results to return. Defaults to 1000.
            **kwargs: Accepted but not used by this method.

        Returns:
            dimcli.DslDataset | None: JSON data of the results, or None (after printing a
                message) when the exact-match lookup fails.
        """
        returned_fields = "" if fields is None else fields
        search_query = (
            f'search source_titles for "{title}" return source_titles{returned_fields}'
        )

        if iterative:
            result = self.query_iterative(search_query, limit=limit)
        else:
            result = self.query(f"{search_query} limit {limit}")

        if not exact_match:
            return result

        try:
            candidates = result.as_dataframe()
            matches = candidates[candidates["title"].str.lower() == title.lower()]
            # Taking element 0 raises when no title matched, which is reported below.
            journal_id = matches["id"].values.tolist()[0]
            return self.query(
                f'search source_titles where id="{journal_id}" return source_titles{returned_fields}'
            )
        except Exception:
            print("No journal can be found.")
            return None

    def search_org_by_id(self, id, fields=None, **kwargs):
        """Look up an organization by its ID.

        Args:
            id (str): The ID of the organization. For example, grid.411461.7
            fields (str, optional): The fields to return. For example, [basics+extras].
                Defaults to None, in which case no field list is added to the query.
            **kwargs: Accepted but not used by this method.

        Returns:
            dimcli.DslDataset: JSON data of the results.
        """
        returned_fields = fields if fields is not None else ""
        return self.query(
            f'search organizations where id="{id}" return organizations{returned_fields}'
        )

    def search_org_by_name(
        self,
        name,
        exact_match=True,
        fields=None,
        iterative=False,
        limit=1000,
        return_list=False,
        **kwargs,
    ):
        """Search organizations by name.

        Args:
            name (str): The name of the organization.
            exact_match (bool, optional): If True, only the organization whose name equals
                ``name`` (case-insensitively) is kept. Defaults to True.
            fields (str, optional): The fields to return. For example, [basics+extras]. Defaults to None.
            iterative (bool, optional): If True, the query is run with query_iterative(). Defaults to False.
            limit (int, optional): The number of results to return. Defaults to 1000.
            return_list (bool, optional): If True, the results are returned as a list of
                "id | name" strings sorted by name. Defaults to False.
            **kwargs: Accepted but not used by this method.

        Returns:
            dimcli.DslDataset | list | None: JSON data of the results; a list of strings when
                return_list is True; None when the list would be empty or when the
                lookup fails (a message is printed in the latter case).
        """
        returned_fields = "" if fields is None else fields
        search_query = f'search organizations for "\\"{name}\\"" return organizations{returned_fields}'

        if iterative:
            result = self.query_iterative(search_query, limit=limit)
        else:
            result = self.query(f"{search_query} limit {limit}")

        try:
            if exact_match:
                candidates = result.as_dataframe()
                exact = candidates[candidates["name"].str.lower() == name.lower()]
                # Taking element 0 raises when no name matched, which is reported below.
                org_id = exact["id"].values.tolist()[0]
                result = self.query(
                    f'search organizations where id="{org_id}" return organizations{returned_fields}'
                )

            if not return_list:
                return result

            listing = result.as_dataframe()
            if listing.empty:
                return None

            listing.sort_values("name", inplace=True)
            listing["name_id"] = listing["id"] + " | " + listing["name"]
            return listing["name_id"].values.tolist()

        except Exception:
            print("No organization can be found.")
            return None

    def search_pubs_by_researcher_id(
        self,
        id,
        start_year=None,
        end_year=None,
        fields=None,
        iterative=False,
        limit=1000,
        **kwargs,
    ):
        """Search the publications of a researcher.

        Args:
            id (str): The ID of the researcher. For example, ur.010551261751.12
            start_year (int, optional): Only include publications from this year on. Defaults to None.
            end_year (int, optional): Only include publications up to this year. Defaults to None.
            fields (str, optional): The fields to return. For example, [basics+extras]. Defaults to None,
                in which case [basics+authors_count+times_cited+dimensions_url] is used.
            iterative (bool, optional): If True, the query is run with query_iterative(). Defaults to False.
            limit (int, optional): The number of results to return. Defaults to 1000.
            **kwargs: Accepted but not used by this method.

        Returns:
            dimcli.DslDataset: JSON data of the results.
        """
        returned_fields = (
            "[basics+authors_count+times_cited+dimensions_url]"
            if fields is None
            else fields
        )

        # Collect the filters and join them, instead of spelling out every
        # combination of the optional year bounds.
        conditions = [f'researchers.id="{id}"']
        if start_year is not None:
            conditions.append(f"year>={start_year}")
        if end_year is not None:
            conditions.append(f"year<={end_year}")
        where_clause = " and ".join(conditions)

        query = f"search publications where {where_clause} return publications{returned_fields}"

        if iterative:
            return self.query_iterative(query, limit=limit)
        return self.query(f"{query} limit {limit}")

    def search_pubs_by_journal_id(
        self,
        id,
        start_year=None,
        end_year=None,
        fields=None,
        iterative=False,
        limit=1000,
        **kwargs,
    ):
        """Search the publications of a journal.

        Args:
            id (str): The ID of the journal. For example, jour.1018957
            start_year (int, optional): Only include publications from this year on. Defaults to None.
            end_year (int, optional): Only include publications up to this year. Defaults to None.
            fields (str, optional): The fields to return. For example, [basics+extras].
                Defaults to None, in which case no field list is added to the query.
            iterative (bool, optional): If True, the query is run with query_iterative(). Defaults to False.
            limit (int, optional): The number of results to return. Defaults to 1000.
            **kwargs: Accepted but not used by this method.

        Returns:
            dimcli.DslDataset: JSON data of the results.
        """
        returned_fields = "" if fields is None else fields

        # Collect the filters and join them, instead of spelling out every
        # combination of the optional year bounds.
        conditions = [f'journal.id="{id}"']
        if start_year is not None:
            conditions.append(f"year>={start_year}")
        if end_year is not None:
            conditions.append(f"year<={end_year}")
        where_clause = " and ".join(conditions)

        query = f"search publications where {where_clause} return publications{returned_fields}"

        if iterative:
            return self.query_iterative(query, limit=limit)
        return self.query(f"{query} limit {limit}")

    def search_pubs_by_org_id(
        self,
        id,
        start_year=None,
        end_year=None,
        fields=None,
        iterative=False,
        limit=1000,
        **kwargs,
    ):
        """Search the publications of an organization.

        Args:
            id (str): The ID of the organization. For example, grid.411461.7
            start_year (int, optional): Only include publications from this year on. Defaults to None.
            end_year (int, optional): Only include publications up to this year. Defaults to None.
            fields (str, optional): The fields to return. For example, [basics+extras].
                Defaults to None, in which case no field list is added to the query.
            iterative (bool, optional): If True, the query is run with query_iterative(). Defaults to False.
            limit (int, optional): The number of results to return. Defaults to 1000.
            **kwargs: Accepted but not used by this method.

        Returns:
            dimcli.DslDataset: JSON data of the results.
        """
        returned_fields = "" if fields is None else fields

        # Collect the filters and join them, instead of spelling out every
        # combination of the optional year bounds.
        conditions = [f'research_orgs="{id}"']
        if start_year is not None:
            conditions.append(f"year>={start_year}")
        if end_year is not None:
            conditions.append(f"year<={end_year}")
        where_clause = " and ".join(conditions)

        query = f"search publications where {where_clause} return publications{returned_fields}"

        if iterative:
            return self.query_iterative(query, limit=limit)
        return self.query(f"{query} limit {limit}")

    def search_pubs_by_keyword(
        self,
        keyword,
        exact_match=True,
        scope="title_abstract_only",
        start_year=None,
        end_year=None,
        journal_id=None,
        fields=None,
        sorted_field="times_cited",
        iterative=False,
        limit=1000,
        **kwargs,
    ):
        """Search publications by keyword.

        Args:
            keyword (str): The keyword to search.
            exact_match (bool, optional): If True, the keyword is wrapped in escaped quotes
                inside the search string. Defaults to True.
            scope (str, optional): The search index to query. One of "authors", "concepts",
                "full_data", "full_data_exact", "title_abstract_only", "title_only".
                Defaults to "title_abstract_only".
            start_year (int, optional): Only include publications from this year on. Defaults to None.
            end_year (int, optional): Only include publications up to this year. Defaults to None.
            journal_id (str, optional): Restrict the search to this journal ID. Defaults to None.
            fields (str, optional): The fields to return. For example, [basics+extras]. Defaults to None,
                in which case a default field list including citation and altmetric data is used.
            sorted_field (str, optional): The field to sort by. Defaults to "times_cited".
            iterative (bool, optional): If True, the query is run with query_iterative(). Defaults to False.
            limit (int, optional): The number of results to return. Defaults to 1000.
            **kwargs: Accepted but not used by this method.

        Raises:
            ValueError: If scope is not one of the allowed scopes.

        Returns:
            dimcli.DslDataset: JSON data of the results.
        """
        allowed_scopes = [
            "authors",
            "concepts",
            "full_data",
            "full_data_exact",
            "title_abstract_only",
            "title_only",
        ]
        if scope not in allowed_scopes:
            raise ValueError(f"scope must be one of {allowed_scopes}")

        quote = '\\"' if exact_match else ""
        returned_fields = (
            "[basics+altmetric+times_cited+field_citation_ratio+authors_count+doi+dimensions_url]"
            if fields is None
            else fields
        )

        year_filters = []
        if start_year is not None:
            year_filters.append(f"year>={start_year}")
        if end_year is not None:
            year_filters.append(f"year<={end_year}")

        if year_filters:
            # With a year filter present, the journal filter is appended with "and".
            journal_clause = (
                f'and journal.id="{journal_id}"' if journal_id is not None else ""
            )
            where_clause = "where " + " and ".join(year_filters) + " " + journal_clause
        else:
            # Otherwise the journal filter, if any, is the whole where clause.
            where_clause = (
                f'where journal.id="{journal_id}"' if journal_id is not None else ""
            )

        query = f'search publications in {scope} for "{quote}{keyword}{quote}" {where_clause} return publications{returned_fields} sort by {sorted_field}'

        if iterative:
            return self.query_iterative(query, limit=limit)
        return self.query(f"{query} limit {limit}")

    def search_grants_by_keyword(
        self,
        keyword,
        exact_match=True,
        scope="title_only",
        start_year=None,
        end_year=None,
        fields=None,
        sorted_field="start_year",
        iterative=False,
        limit=1000,
        **kwargs,
    ):
        """Search grants by keyword.

        Args:
            keyword (str): The keyword to search.
            exact_match (bool, optional): If True, the keyword is wrapped in escaped quotes
                inside the search string. Defaults to True.
            scope (str, optional): The search index to query. One of "concepts", "full_data",
                "investigators", "title_abstract_only", "title_only". Defaults to "title_only".
            start_year (int, optional): Only include grants whose start_year is at least this
                year. Defaults to None.
            end_year (int, optional): A 4-digit year; only include grants whose end_date is on or
                before December 31 of this year. Defaults to None.
            fields (str, optional): The fields to return. Defaults to None, in which case
                [basics+extras+dimensions_url] is used.
            sorted_field (str, optional): The field to sort by. Defaults to "start_year".
            iterative (bool, optional): If True, the query is run with query_iterative(). Defaults to False.
            limit (int, optional): The number of results to return. Defaults to 1000.
            **kwargs: Accepted but not used by this method.

        Raises:
            ValueError: If scope is not one of the allowed scopes, or end_year does not
                have exactly four characters.

        Returns:
            dimcli.DslDataset: JSON data of the results.
        """
        allowed_scopes = [
            "concepts",
            "full_data",
            "investigators",
            "title_abstract_only",
            "title_only",
        ]
        if scope not in allowed_scopes:
            raise ValueError(f"scope must be one of {allowed_scopes}")

        quote = '\\"' if exact_match else ""
        returned_fields = "[basics+extras+dimensions_url]" if fields is None else fields

        conditions = []
        if start_year is not None:
            conditions.append(f"start_year>={start_year}")
        if end_year is not None:
            year_text = str(end_year)
            if len(year_text) != 4:
                raise ValueError("end_year must be a 4-digit number")
            # The end bound is compared against end_date, so use the last day of the year.
            end_date = year_text + "-12-31"
            conditions.append(f'end_date<="{end_date}"')

        where_clause = " where " + " and ".join(conditions) if conditions else ""
        query = f'search grants in {scope} for "{quote}{keyword}{quote}"{where_clause} return grants{returned_fields} sort by {sorted_field}'

        if iterative:
            return self.query_iterative(query, limit=limit)
        return self.query(f"{query} limit {limit}")

    def h_index(self, id, iterative=False, limit=1000):
        """Get the h-index of a researcher.

        The h-index is the largest h such that the researcher has at least h
        publications with at least h citations each, e.g. citation counts
        [10, 8, 5, 4, 3] give 4, [25, 8, 5, 3, 3] give 3 and [1000, 20] give 2.

        Args:
            id (str): The ID of the researcher. For example, ur.010551261751.12
            iterative (bool, optional): If True, the query is run with query_iterative(). Defaults to False.
            limit (int, optional): The number of results to return. Defaults to 1000.

        Returns:
            int: The h-index computed from the returned publications; 0 when the
                result has no "times_cited" column (e.g. no publications).
        """
        query = f'search publications where researchers.id = "{id}" return publications[times_cited] sort by times_cited'

        if iterative:
            result = self.query_iterative(query, limit=limit)
        else:
            result = self.query(f"{query} limit {limit}")

        frame = result.as_dataframe()
        if "times_cited" not in frame.columns:
            return 0

        # Sort locally as well, so the computation does not depend on the
        # order in which the records come back. Missing counts are treated as 0.
        citations = sorted(frame["times_cited"].fillna(0), reverse=True)

        # Iterative scan: the previous recursive helper copied the list on
        # every step and could exceed the recursion limit for long lists.
        h = 0
        for rank, cited in enumerate(citations, start=1):
            if cited < rank:
                break
            h = rank
        return h

    def researcher_pubs_stats(
        self,
        id,
        start_year=None,
        end_year=None,
        iterative=False,
        limit=1000,
        return_plot=False,
        **kwargs,
    ):
        """Count a researcher's publications per year.

        Args:
            id (str): The ID of the researcher. For example, ur.010551261751.12
            start_year (int, optional): The start year of the publication. Defaults to None.
            end_year (int, optional): The end year of the publication. Defaults to None.
            iterative (bool, optional): If True, the query will be iterative. Defaults to False.
            limit (int, optional): The number of results to return. Defaults to 1000.
            return_plot (bool, optional): If True, a bar plot is returned instead of the dataframe. Defaults to False.
            **kwargs: Keyword arguments forwarded to the pandas bar plot when return_plot is True.

        Returns:
            pandas.DataFrame: A dataframe with the columns "year" and "citations", where
                "citations" holds the number of publication rows for that year; or the
                object returned by the pandas bar plot when return_plot is True.
        """
        publications = self.search_pubs_by_researcher_id(
            id, start_year, end_year, iterative=iterative, limit=limit
        ).as_dataframe()
        yearly_counts = publications["year"].value_counts().sort_index()

        if return_plot:
            return yearly_counts.plot.bar(**kwargs)

        return pd.DataFrame(
            {"year": yearly_counts.index, "citations": yearly_counts.values}
        )

    def researcher_pubs_authors(
        self, id, start_year=None, end_year=None, iterative=False, limit=1000
    ):
        """Get a researcher's publications together with each one's author count.

        Args:
            id (str): The ID of the researcher. For example, ur.010551261751.12
            start_year (int, optional): The start year of the publication. Defaults to None.
            end_year (int, optional): The end year of the publication. Defaults to None.
            iterative (bool, optional): If True, the query will be iterative. Defaults to False.
            limit (int, optional): The number of results to return. Defaults to 1000.

        Returns:
            pandas.DataFrame: The publications dataframe with an added "authors" column
                holding the number of author rows found for each publication ID.
        """
        dataset = self.search_pubs_by_researcher_id(
            id, start_year, end_year, iterative=iterative, limit=limit
        )
        publications = dataset.as_dataframe()

        # Number of author rows per publication ID.
        author_counts = dataset.as_dataframe_authors()["pub_id"].value_counts()
        per_publication = pd.DataFrame(
            {"id": author_counts.index, "authors": author_counts.values}
        ).set_index("id")

        return publications.join(per_publication, on="id")

    def researcher_annual_stats(self, data, geonames_df=None):
        """Get the annual stats of a researcher, including publications, collaborators, collaborating institutions, and cities.

        Args:
            data (dimcli.DslDataset): JSON data of the input. It can be derived from search_pubs_by_researcher_id().
            geonames_df (pd.DataFrame, optional): The geonames dataframe passed to geoname_latlon()
                to resolve city IDs to coordinates. Defaults to None.

        Returns:
            pd.DataFrame | tuple: A dataframe with the per-year columns "year", "pubs",
                "collaborators", "institutions" and "cities". When geonames_df is given, a tuple
                of that dataframe and a per-author dataframe with the columns "name", "year",
                "institution", "city", "country", "latitude" and "longitude".
        """

        def first_affiliation_value(affiliation, key, default):
            """Return ``key`` from the first affiliation entry, or ``default`` if unavailable."""
            try:
                return affiliation[0][key]
            except (IndexError, KeyError, TypeError):
                # Missing, empty or malformed affiliation data.
                return default

        pubs = data.as_dataframe()
        years_dict = pubs[["id", "year"]].set_index("id").to_dict()["year"]
        df = data.as_dataframe_authors()
        df["name"] = df["first_name"] + " " + df["last_name"]
        affiliations = df["affiliations"].values.tolist()

        # Only the first affiliation of each author row is used.
        city_ids = [first_affiliation_value(a, "city_id", 0) for a in affiliations]

        df["year"] = [years_dict[pub_id] for pub_id in df["pub_id"].values.tolist()]
        df["institution"] = [
            first_affiliation_value(a, "name", "") for a in affiliations
        ]
        df["city"] = [first_affiliation_value(a, "city", "") for a in affiliations]
        df["country"] = [
            first_affiliation_value(a, "country", "") for a in affiliations
        ]
        df["city_id"] = city_ids

        if geonames_df is not None:
            latitudes = []
            longitudes = []
            for city_id in city_ids:
                try:
                    latitude, longitude = geoname_latlon(city_id, geonames_df)
                except Exception:
                    # Best effort: any failed lookup falls back to (0, 0).
                    latitude, longitude = 0, 0
                latitudes.append(latitude)
                longitudes.append(longitude)
            df["latitude"] = latitudes
            df["longitude"] = longitudes

        # For each year, count the distinct values of the second grouping key.
        pubs_stats = pubs.groupby("year").size()
        collaborators_stats = (
            df.groupby(["year", "name"]).size().groupby(level=0).size()
        )
        institutions_stats = (
            df.groupby(["year", "institution"]).size().groupby(level=0).size()
        )
        cities_stats = df.groupby(["year", "city_id"]).size().groupby(level=0).size()

        df2 = pd.DataFrame(
            {
                "year": cities_stats.index,
                "pubs": pubs_stats,
                "collaborators": collaborators_stats,
                "institutions": institutions_stats,
                "cities": cities_stats,
            }
        )
        if geonames_df is None:
            return df2

        return (
            df2,
            df[
                [
                    "name",
                    "year",
                    "institution",
                    "city",
                    "country",
                    "latitude",
                    "longitude",
                ]
            ],
        )

    def org_pubs_annual_stats(
        self,
        org_id,
        start_year=None,
        end_year=None,
        iterative=False,
        limit=1000,
        return_plot=False,
        **kwargs,
    ):
        """Get the yearly publication facet of an organization.

        Args:
            org_id (str): The ID of the organization. For example, grid.411461.7
            start_year (int, optional): The start year of the publication. Defaults to None.
            end_year (int, optional): The end year of the publication. Defaults to None.
            iterative (bool, optional): If True, the query will be iterative. Defaults to False.
            limit (int, optional): The number of results to return. Defaults to 1000.
            return_plot (bool, optional): If True, the plot of the results will be returned. Defaults to False.
            **kwargs: Accepted but not used by this method.

        Returns:
            pd.DataFrame | tuple: The result dataframe with its "id" column renamed to "year".
                When return_plot is True, a tuple (dataframe, figure) where the figure is a
                bar chart of "count" by "year", or None when the dataframe is empty.
        """
        conditions = [f'research_orgs="{org_id}"']
        if start_year is not None:
            conditions.append(f"year>={start_year}")
        if end_year is not None:
            conditions.append(f"year<={end_year}")
        where_clause = " and ".join(conditions)

        query = f"search publications where {where_clause} return year"

        if iterative:
            result = self.query_iterative(query, limit=limit)
        else:
            result = self.query(f"{query} limit {limit}")

        df = result.as_dataframe()
        df.rename(columns={"id": "year"}, inplace=True)

        if not return_plot:
            return df

        if df.empty:
            return df, None

        # Look the organization up again to get a readable name for the title.
        org_name = self.search_org_by_id(org_id).as_dataframe()["name"][0]
        fig = px.bar(
            df,
            x="year",
            y="count",
            title=f"Publications from {org_name} - by year",
        )
        return df, fig

    def org_grants_annual_stats(self, org_id, start_year=None, end_year=None, iterative=False, limit=100, return_plot=False, **kwargs):
        """Get annual grant statistics (by grant start year) for an organization.

        Args:
            org_id (str): The ID of the organization. For example, grid.411461.7
            start_year (int, optional): Earliest grant start year to include. Defaults to None (no lower bound).
            end_year (int, optional): Latest grant start year to include. Defaults to None (no upper bound).
            iterative (bool, optional): If True, the query will be iterative. Defaults to False.
            limit (int, optional): The number of results to return. Defaults to 100.
            return_plot (bool, optional): If True, the plots of the results will be returned. Defaults to False.
            **kwargs: Accepted but not used by this method.

        Returns:
            pandas.DataFrame | tuple: The results as a dataframe whose "id" column is renamed to "year".
                If return_plot is True, a tuple (dataframe, count figure, funding figure) is returned
                instead; both figures are None when the dataframe is empty.
        """

        filters = [f'research_orgs="{org_id}"']
        if start_year is not None:
            filters.append(f"start_year>={start_year}")
        if end_year is not None:
            # Results are grouped by start_year, so the upper bound is applied to
            # start_year as well (previously end_year was silently ignored).
            filters.append(f"start_year<={end_year}")
        query = f"search grants where {' and '.join(filters)} return start_year aggregate funding"

        if iterative:
            result = self.query_iterative(query, limit=limit)
        else:
            query = f"{query} limit {limit}"
            result = self.query(query)

        df = result.as_dataframe()
        df.rename(columns={"id": "year"}, inplace=True)

        if not return_plot:
            return df
        if df.empty:
            return df, None, None

        # The organization name is only needed for the chart titles.
        org_name = self.search_org_by_id(org_id).as_dataframe()["name"][0]
        fig_count = px.bar(df, x="year", y="count",
                           title=f"The number of grants for {org_name} - by year")
        fig_amount = px.bar(df, x="year", y="funding",
                            title=f"The funding amount for {org_name} - by year")
        return df, fig_count, fig_amount

    def org_grant_funders(self, org_id, start_year=None, end_year=None, iterative=False, limit=20, return_plot=False, **kwargs):
        """Get the funders of an organization's grants with aggregated funding.

        Args:
            org_id (str): The ID of the organization. For example, grid.411461.7
            start_year (int, optional): Lower bound for the `year` filter. Defaults to None (no lower bound).
            end_year (int, optional): Upper bound for the `year` filter. Defaults to None (no upper bound).
            iterative (bool, optional): If True, the query is run with query_iterative(). Defaults to False.
            limit (int, optional): The limit applied to the query. Defaults to 20.
            return_plot (bool, optional): If True, a bar chart of the results is returned as well. Defaults to False.
            **kwargs: Accepted but not used by this method.

        Returns:
            pandas.DataFrame | tuple: The results as a dataframe. If return_plot is True, a tuple
                (dataframe, figure) is returned instead; the figure is None when the dataframe is empty.
        """

        # NOTE(review): this filters grants on `year`, while the sibling grant
        # helpers in this class filter on `start_year` - confirm the field name.
        clauses = [f'research_orgs="{org_id}"']
        if start_year is not None:
            clauses.append(f"year>={start_year}")
        if end_year is not None:
            clauses.append(f"year<={end_year}")
        query = f"search grants where {' and '.join(clauses)} return funders aggregate funding sort by funding"

        if iterative:
            result = self.query_iterative(query, limit=limit)
        else:
            result = self.query(f"{query} limit {limit}")

        funders = result.as_dataframe()
        if not return_plot:
            return funders
        if funders.empty:
            return funders, None

        # The organization name is only needed for the chart title.
        org_name = self.search_org_by_id(org_id).as_dataframe()["name"][0]
        fig = px.bar(
            funders,
            x="name",
            y="funding",
            title=f"Funding for {org_name} - by funder",
        )
        return funders, fig

    def search_grants_by_org(self, org_id, start_year=None, end_year=None, iterative=False, limit=1000, **kwargs):
        """Search grants by organization id.

        Args:
            org_id (str): The ID of the organization. For example, grid.411461.7
            start_year (int, optional): Earliest grant start year to include. Defaults to None (no lower bound).
            end_year (int, optional): Latest grant start year to include. Defaults to None (no upper bound).
            iterative (bool, optional): If True, the query will be iterative. Defaults to False.
            limit (int, optional): The number of results to return. Defaults to 1000.
            **kwargs: Accepted but not used by this method.

        Returns:
            dimcli.DslDataset: JSON data of the results.
        """

        filters = [f'research_orgs="{org_id}"']
        if start_year is not None:
            filters.append(f"start_year>={start_year}")
        if end_year is not None:
            # Applied whether or not start_year is given; previously end_year was
            # dropped when both bounds were supplied.
            filters.append(f"start_year<={end_year}")
        query = f"search grants where {' and '.join(filters)} return grants[basics+extras]"

        if iterative:
            result = self.query_iterative(query, limit=limit)
        else:
            query = f"{query} limit {limit}"
            result = self.query(query)

        return result

    def search_grants_by_researcher(self, researcher_id, start_year=None, end_year=None, iterative=False, limit=1000, **kwargs):
        """Search grants by researcher id.

        Args:
            researcher_id (str): The ID of the researcher. For example, ur.01361677540.55
            start_year (int, optional): Earliest grant start year to include. Defaults to None (no lower bound).
            end_year (int, optional): Latest grant start year to include. Defaults to None (no upper bound).
            iterative (bool, optional): If True, the query will be iterative. Defaults to False.
            limit (int, optional): The number of results to return. Defaults to 1000.
            **kwargs: Accepted but not used by this method.

        Returns:
            dimcli.DslDataset: JSON data of the results.
        """

        filters = [f'researchers="{researcher_id}"']
        if start_year is not None:
            filters.append(f"start_year>={start_year}")
        if end_year is not None:
            # Applied whether or not start_year is given; previously end_year was
            # dropped when both bounds were supplied.
            filters.append(f"start_year<={end_year}")
        query = f"search grants where {' and '.join(filters)} return grants[basics+extras]"

        if iterative:
            result = self.query_iterative(query, limit=limit)
        else:
            query = f"{query} limit {limit}"
            result = self.query(query)

        return result

    def org_pubs_most_cited(self, org_id, recent=False, iterative=False, limit=100, **kwargs):
        """Search an organization's publications sorted by citation count.

        Args:
            org_id (str): The ID of the organization. For example, grid.411461.7
            recent (bool, optional): If True, return and sort by recent_citations instead of times_cited. Defaults to False.
            iterative (bool, optional): If True, the query is run with query_iterative(). Defaults to False.
            limit (int, optional): The limit applied to the query. Defaults to 100.
            **kwargs: Accepted but not used by this method.

        Returns:
            dimcli.DslDataset: JSON data of the results.
        """

        # The same field is both returned and used as the sort key.
        metric = "recent_citations" if recent else "times_cited"
        query = f"""search publications where research_orgs.id="{org_id}"
            return publications[basics+{metric}]
            sort by {metric}"""

        if iterative:
            return self.query_iterative(query, limit=limit)
        return self.query(f"{query} limit {limit}")

    def org_pubs_top_areas(self, org_id, iterative=False, limit=1000, return_plot=False, **kwargs):
        """Get the research areas (category_for) of an organization's publications.

        Args:
            org_id (str): The ID of the organization. For example, grid.411461.7
            iterative (bool, optional): If True, the query will be iterative. Defaults to False.
            limit (int, optional): The number of results to return. Defaults to 1000.
            return_plot (bool, optional): If True, the plots of the results will be returned. Defaults to False.
            **kwargs: Accepted but not used by this method.

        Returns:
            pandas.DataFrame | tuple: A dataframe with one row per category_for entry of each
                publication, carrying the publication's doi, title and times_cited. If return_plot
                is True, a tuple (dataframe, research-area figure, journal figure) is returned instead.
        """

        query = f"""search publications where research_orgs.id="{org_id}"
            return publications[doi+title+times_cited+category_for+journal]
            sort by times_cited"""

        if iterative:
            result = self.query_iterative(query, limit=limit)
        else:
            query = f"{query} limit {limit}"
            result = self.query(query)

        # Built before normalize_key() touches result.publications, as in the original ordering.
        raw_df = result.as_dataframe()
        # presumably fills in a missing "category_for" key with [] so that
        # json_normalize can use it as record_path - confirm against dimcli docs
        dimcli.utils.normalize_key("category_for", result.publications, [])
        df = pd.json_normalize(
            result.publications,
            record_path="category_for",
            meta=["doi", "title", "times_cited"],
            errors="ignore",
        )

        if not return_plot:
            return df

        # Only look up the organization name (an extra query) when a plot title needs it.
        org_name = self.search_org_by_id(org_id).as_dataframe()["name"][0]
        area_fig = px.scatter(df,
            x="times_cited", y="name",
            marginal_x="histogram",
            marginal_y="histogram",
            hover_data=["doi", "title"],
            height=600,
            title=f"Publications from {org_name} - Research Areas vs. Citations")
        journal_fig = px.scatter(raw_df,
            x="times_cited", y="journal.title",
            marginal_x="histogram",
            marginal_y="histogram",
            height=600,
            title=f"Publications from {org_name} - Journals vs. Citations")

        return df, area_fig, journal_fig

`__init__(self, key=None, **kwargs)` `special` ¶

Initialize the DSL.

Source code in scholarpy/scholarpy.py

def __init__(self, key=None, **kwargs):
    """Initialize the DSL and log in to the Dimensions API.

    Args:
        key (str, optional): The Dimensions API key. If None, the DIM_TOKEN
            environment variable is used. Defaults to None.
        **kwargs: Passed on to the parent class initializer.

    Raises:
        ValueError: If no key is given and DIM_TOKEN is not set. The API lab
            page is opened in the web browser before raising.
    """
    if key is None:
        key = os.environ.get("DIM_TOKEN")
        if key is None:
            webbrowser.open("https://api-lab.dimensions.ai")
            raise ValueError(
                "No Dimensions API key can be found. Please go to https://www.dimensions.ai/contact-us to request an API key."
            )

    # Log in with an explicitly passed key too; previously the login only
    # happened when the key came from the environment.
    dimcli.login(key=key)

    super().__init__(**kwargs)

`h_index(self, id, iterative=False, limit=1000)` ¶

Get the h-index of a researcher.

Parameters:

Name	Type	Description	Default
`id`	`str`	The ID of the researcher. For example, ur.010551261751.12	required
`iterative`	`bool`	If True, the query will be iterative. Defaults to False.	`False`
`limit`	`int`	The number of results to return. Defaults to 1000.	`1000`

Source code in scholarpy/scholarpy.py

def h_index(self, id, iterative=False, limit=1000):
    """Get the h-index of a researcher.

    The h-index is the largest h such that the researcher has at least h
    publications with at least h citations each. It is computed from the
    publications returned by the query, so it is bounded by `limit`.

    Args:
        id (str): The ID of the researcher. For example, ur.010551261751.12
        iterative (bool, optional): If True, the query will be iterative. Defaults to False.
        limit (int, optional): The number of results to return. Defaults to 1000.

    Returns:
        int: The h-index; 0 if the query returns no publications.
    """

    def the_H_function(citations):
        """from a list of integers [n1, n2 ..] representing publications citations,
        return the max list-position which is >= integer

        eg
        >>> the_H_function([10, 8, 5, 4, 3]) => 4
        >>> the_H_function([25, 8, 5, 3, 3]) => 3
        >>> the_H_function([1000, 20]) => 2
        """
        # Sort locally instead of trusting the API order, and iterate instead of
        # recursing so long publication lists cannot hit the recursion limit.
        h = 0
        for rank, cited in enumerate(sorted(citations, reverse=True), start=1):
            if cited < rank:
                break
            h = rank
        return h

    def get_pubs_citations(researcher_id, iterative=False, limit=1000):
        q = f'search publications where researchers.id = "{researcher_id}" return publications[times_cited] sort by times_cited'

        if iterative:
            result = self.query_iterative(q, limit=limit)
        else:
            q = f"{q} limit {limit}"
            result = self.query(q)

        df = result.as_dataframe()
        # An empty result has no "times_cited" column at all.
        if "times_cited" not in df.columns:
            return []
        # Publications without a citation count are treated as 0 citations.
        return df["times_cited"].fillna(0).tolist()

    return the_H_function(get_pubs_citations(id, iterative=iterative, limit=limit))

`org_grant_funders(self, org_id, start_year=None, end_year=None, iterative=False, limit=20, return_plot=False, **kwargs)` ¶

Top funders of an organization.

Parameters:

Name	Type	Description	Default
`org_id`	`str`	The ID of the organization. For example, grid.411461.7	required
`start_year`	`int`	The start year of the publication. Defaults to None.	`None`
`end_year`	`int`	The end year of the publication. Defaults to None.	`None`
`iterative`	`bool`	If True, the query will be iterative. Defaults to False.	`False`
`limit`	`int`	The number of results to return. Defaults to 20.	`20`
`return_plot`	`bool`	If True, the plot of the results will be returned. Defaults to False.	`False`

Returns:

Type	Description
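`pandas.DataFrame`	The funders dataframe. If `return_plot` is True, a tuple `(dataframe, figure)` is returned instead; the figure is None when the dataframe is empty.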

Source code in scholarpy/scholarpy.py

def org_grant_funders(self, org_id, start_year=None, end_year=None, iterative=False, limit=20, return_plot=False, **kwargs):
    """Get the funders of an organization's grants with aggregated funding.

    Args:
        org_id (str): The ID of the organization. For example, grid.411461.7
        start_year (int, optional): Lower bound for the `year` filter. Defaults to None (no lower bound).
        end_year (int, optional): Upper bound for the `year` filter. Defaults to None (no upper bound).
        iterative (bool, optional): If True, the query is run with query_iterative(). Defaults to False.
        limit (int, optional): The limit applied to the query. Defaults to 20.
        return_plot (bool, optional): If True, a bar chart of the results is returned as well. Defaults to False.
        **kwargs: Accepted but not used by this method.

    Returns:
        pandas.DataFrame | tuple: The results as a dataframe. If return_plot is True, a tuple
            (dataframe, figure) is returned instead; the figure is None when the dataframe is empty.
    """

    # NOTE(review): this filters grants on `year`, while the sibling grant
    # helpers in this class filter on `start_year` - confirm the field name.
    clauses = [f'research_orgs="{org_id}"']
    if start_year is not None:
        clauses.append(f"year>={start_year}")
    if end_year is not None:
        clauses.append(f"year<={end_year}")
    query = f"search grants where {' and '.join(clauses)} return funders aggregate funding sort by funding"

    if iterative:
        result = self.query_iterative(query, limit=limit)
    else:
        result = self.query(f"{query} limit {limit}")

    funders = result.as_dataframe()
    if not return_plot:
        return funders
    if funders.empty:
        return funders, None

    # The organization name is only needed for the chart title.
    org_name = self.search_org_by_id(org_id).as_dataframe()["name"][0]
    fig = px.bar(
        funders,
        x="name",
        y="funding",
        title=f"Funding for {org_name} - by funder",
    )
    return funders, fig

`org_grants_annual_stats(self, org_id, start_year=None, end_year=None, iterative=False, limit=100, return_plot=False, **kwargs)` ¶

Get annual grant statistics (number of grants and aggregated funding per grant start year) for an organization.

Parameters:

Name	Type	Description	Default
`org_id`	`str`	The ID of the organization. For example, grid.411461.7	required
`start_year`	`int`	The start year of the publication. Defaults to None.	`None`
`end_year`	`int`	The end year of the publication. Defaults to None.	`None`
`iterative`	`bool`	If True, the query will be iterative. Defaults to False.	`False`
`limit`	`int`	The number of results to return. Defaults to 100.	`100`
`return_plot`	`bool`	If True, the plot of the results will be returned. Defaults to False.	`False`

Returns:

Type	Description
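`pandas.DataFrame`	The per-year dataframe. If `return_plot` is True, a tuple `(dataframe, count figure, funding figure)` is returned instead; both figures are None when the dataframe is empty.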

Source code in scholarpy/scholarpy.py

def org_grants_annual_stats(self, org_id, start_year=None, end_year=None, iterative=False, limit=100, return_plot=False, **kwargs):
    """Get annual grant statistics (by grant start year) for an organization.

    Args:
        org_id (str): The ID of the organization. For example, grid.411461.7
        start_year (int, optional): Earliest grant start year to include. Defaults to None (no lower bound).
        end_year (int, optional): Latest grant start year to include. Defaults to None (no upper bound).
        iterative (bool, optional): If True, the query will be iterative. Defaults to False.
        limit (int, optional): The number of results to return. Defaults to 100.
        return_plot (bool, optional): If True, the plots of the results will be returned. Defaults to False.
        **kwargs: Accepted but not used by this method.

    Returns:
        pandas.DataFrame | tuple: The results as a dataframe whose "id" column is renamed to "year".
            If return_plot is True, a tuple (dataframe, count figure, funding figure) is returned
            instead; both figures are None when the dataframe is empty.
    """

    filters = [f'research_orgs="{org_id}"']
    if start_year is not None:
        filters.append(f"start_year>={start_year}")
    if end_year is not None:
        # Results are grouped by start_year, so the upper bound is applied to
        # start_year as well (previously end_year was silently ignored).
        filters.append(f"start_year<={end_year}")
    query = f"search grants where {' and '.join(filters)} return start_year aggregate funding"

    if iterative:
        result = self.query_iterative(query, limit=limit)
    else:
        query = f"{query} limit {limit}"
        result = self.query(query)

    df = result.as_dataframe()
    df.rename(columns={"id": "year"}, inplace=True)

    if not return_plot:
        return df
    if df.empty:
        return df, None, None

    # The organization name is only needed for the chart titles.
    org_name = self.search_org_by_id(org_id).as_dataframe()["name"][0]
    fig_count = px.bar(df, x="year", y="count",
                       title=f"The number of grants for {org_name} - by year")
    fig_amount = px.bar(df, x="year", y="funding",
                        title=f"The funding amount for {org_name} - by year")
    return df, fig_count, fig_amount

`org_pubs_annual_stats(self, org_id, start_year=None, end_year=None, iterative=False, limit=1000, return_plot=False, **kwargs)` ¶

Get the number of publications per year for an organization.

Parameters:

Name	Type	Description	Default
`org_id`	`str`	The ID of the organization. For example, grid.411461.7	required
`start_year`	`int`	The start year of the publication. Defaults to None.	`None`
`end_year`	`int`	The end year of the publication. Defaults to None.	`None`
`iterative`	`bool`	If True, the query will be iterative. Defaults to False.	`False`
`limit`	`int`	The number of results to return. Defaults to 1000.	`1000`
`return_plot`	`bool`	If True, the plot of the results will be returned. Defaults to False.	`False`

Returns:

Type	Description
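`pandas.DataFrame`	The per-year dataframe. If `return_plot` is True, a tuple `(dataframe, figure)` is returned instead; the figure is None when the dataframe is empty.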

Source code in scholarpy/scholarpy.py

def org_pubs_annual_stats(self, org_id, start_year=None, end_year=None, iterative=False, limit=1000, return_plot=False, **kwargs):
    """Get the number of publications per year for an organization.

    Args:
        org_id (str): The ID of the organization. For example, grid.411461.7
        start_year (int, optional): Earliest publication year to include. Defaults to None (no lower bound).
        end_year (int, optional): Latest publication year to include. Defaults to None (no upper bound).
        iterative (bool, optional): If True, the query is run with query_iterative(). Defaults to False.
        limit (int, optional): The limit applied to the query. Defaults to 1000.
        return_plot (bool, optional): If True, a bar chart of the results is returned as well. Defaults to False.
        **kwargs: Accepted but not used by this method.

    Returns:
        pandas.DataFrame | tuple: The results as a dataframe whose "id" column is renamed to "year".
            If return_plot is True, a tuple (dataframe, figure) is returned instead; the figure is
            None when the dataframe is empty.
    """

    # Assemble the where-clause from whichever year bounds were supplied.
    conditions = [f'research_orgs="{org_id}"']
    if start_year is not None:
        conditions.append(f"year>={start_year}")
    if end_year is not None:
        conditions.append(f"year<={end_year}")
    query = f"search publications where {' and '.join(conditions)} return year"

    if iterative:
        result = self.query_iterative(query, limit=limit)
    else:
        result = self.query(f"{query} limit {limit}")

    stats = result.as_dataframe()
    stats.rename(columns={"id": "year"}, inplace=True)

    if not return_plot:
        return stats
    if stats.empty:
        return stats, None

    # The organization name is only needed for the chart title.
    org_name = self.search_org_by_id(org_id).as_dataframe()["name"][0]
    fig = px.bar(
        stats,
        x="year",
        y="count",
        title=f"Publications from {org_name} - by year",
    )
    return stats, fig

`researcher_annual_stats(self, data, geonames_df=None)` ¶

Get the annual stats of a researcher, including publications, collaborators, collaborating institutions, and cities.

Parameters:

Name	Type	Description	Default
`data`	`dimcli.DslDataset`	JSON data of the input. It can be derived from search_pubs_by_researcher_id().	required
`geonames_df`	`pd.DataFrame`	The geonames dataframe. Defaults to None.	`None`

Returns:

Type	Description
`pd.DataFrame`	The dataframe of the results.

Source code in scholarpy/scholarpy.py

def researcher_annual_stats(self, data, geonames_df=None):
    """Get the annual stats of a researcher, including publications, collaborators, collaborating institutions, and cities.

    Only the first affiliation of each author is considered. Authors whose
    first affiliation cannot be read get "" for institution, city and
    country, and 0 for city_id (and for latitude/longitude when the geonames
    lookup fails).

    Args:
        data (dimcli.DslDataset): JSON data of the input. It can be derived from search_pubs_by_researcher_id().
        geonames_df (pd.DataFrame, optional): The geonames dataframe. Defaults to None.

    Returns:
        pd.DataFrame | tuple: The per-year dataframe with the columns year, pubs,
            collaborators, institutions and cities. If geonames_df is given, a tuple
            (per-year dataframe, per-author dataframe with name, year, institution,
            city, country, latitude and longitude) is returned instead.
    """

    def first_affiliation_value(affiliation, key, default):
        """Return `key` from the first affiliation entry, or `default` if it cannot be read."""
        try:
            return affiliation[0][key]
        except Exception:
            # Best effort: missing, empty or malformed affiliations fall back to
            # the default. `Exception` (not a bare except) keeps
            # KeyboardInterrupt/SystemExit from being swallowed.
            return default

    pubs = data.as_dataframe()
    years_dict = pubs[["id", "year"]].set_index("id").to_dict()["year"]
    df = data.as_dataframe_authors()
    df["name"] = df["first_name"] + " " + df["last_name"]
    affiliations = df["affiliations"].values.tolist()

    institutions = [first_affiliation_value(a, "name", "") for a in affiliations]
    city_ids = [first_affiliation_value(a, "city_id", 0) for a in affiliations]
    cities = [first_affiliation_value(a, "city", "") for a in affiliations]
    countries = [first_affiliation_value(a, "country", "") for a in affiliations]

    # Each author row takes the year of the publication it belongs to.
    years = [years_dict[pub_id] for pub_id in df["pub_id"].values.tolist()]

    df["year"] = years
    df["institution"] = institutions
    df["city"] = cities
    df["country"] = countries
    df["city_id"] = city_ids

    if geonames_df is not None:
        latitudes = []
        longitudes = []
        for city_id in city_ids:
            try:
                latitude, longitude = geoname_latlon(city_id, geonames_df)
            except Exception:
                # Unknown city ids (including the 0 placeholder) map to (0, 0).
                latitude, longitude = 0, 0
            latitudes.append(latitude)
            longitudes.append(longitude)
        df["latitude"] = latitudes
        df["longitude"] = longitudes

    # Number of distinct values per year = size of the (year, value) groups, counted per year.
    pubs_stats = pubs.groupby("year").size()
    collaborators_stats = (
        df.groupby(["year", "name"]).size().groupby(level=0).size()
    )
    institutions_stats = (
        df.groupby(["year", "institution"]).size().groupby(level=0).size()
    )
    cities_stats = df.groupby(
        ["year", "city_id"]).size().groupby(level=0).size()

    df2 = pd.DataFrame(
        {
            "year": cities_stats.index,
            "pubs": pubs_stats,
            "collaborators": collaborators_stats,
            "institutions": institutions_stats,
            "cities": cities_stats,
        }
    )
    if geonames_df is None:
        return df2

    return (
        df2,
        df[
            [
                "name",
                "year",
                "institution",
                "city",
                "country",
                "latitude",
                "longitude",
            ]
        ],
    )

`researcher_pubs_authors(self, id, start_year=None, end_year=None, iterative=False, limit=1000)` ¶

Get the authors of a researcher's publications.

Parameters:

Name	Type	Description	Default
`id`	`str`	The ID of the researcher. For example, ur.010551261751.12	required
`start_year`	`int`	The start year of the publication. Defaults to None.	`None`
`end_year`	`int`	The end year of the publication. Defaults to None.	`None`
`iterative`	`bool`	If True, the query will be iterative. Defaults to False.	`False`
`limit`	`int`	The number of results to return. Defaults to 1000.	`1000`

Returns:

Type	Description
`pandas.DataFrame`	The dataframe of the results.

Source code in scholarpy/scholarpy.py

def researcher_pubs_authors(
    self, id, start_year=None, end_year=None, iterative=False, limit=1000
):
    """Get a researcher's publications together with the number of authors of each.

    Args:
        id (str): The ID of the researcher. For example, ur.010551261751.12
        start_year (int, optional): The start year of the publication. Defaults to None.
        end_year (int, optional): The end year of the publication. Defaults to None.
        iterative (bool, optional): If True, the query will be iterative. Defaults to False.
        limit (int, optional): The number of results to return. Defaults to 1000.

    Returns:
        pandas.DataFrame: The publications dataframe with an extra "authors" column
            holding the number of author rows found for each publication id.
    """

    result = self.search_pubs_by_researcher_id(
        id, start_year, end_year, iterative=iterative, limit=limit
    )
    publications = result.as_dataframe()

    # One row per (publication, author), so counting pub_id gives authors per publication.
    author_counts = result.as_dataframe_authors()["pub_id"].value_counts()
    counts_by_id = pd.DataFrame(
        {"id": author_counts.index, "authors": author_counts.values}
    ).set_index("id")

    return publications.join(counts_by_id, on="id")

`researcher_pubs_stats(self, id, start_year=None, end_year=None, iterative=False, limit=1000, return_plot=False, **kwargs)` ¶

Get the publications stats of a researcher.

Parameters:

Name	Type	Description	Default
`id`	`str`	The ID of the researcher. For example, ur.010551261751.12	required
`start_year`	`int`	The start year of the publication. Defaults to None.	`None`
`end_year`	`int`	The end year of the publication. Defaults to None.	`None`
`iterative`	`bool`	If True, the query will be iterative. Defaults to False.	`False`
`limit`	`int`	The number of results to return. Defaults to 1000.	`1000`
`return_plot`	`bool`	If True, the plot will be returned. Defaults to False.	`False`

Returns:

Type	Description
`pandas.DataFrame`	The dataframe of the results.

Source code in scholarpy/scholarpy.py

def researcher_pubs_stats(
    self,
    id,
    start_year=None,
    end_year=None,
    iterative=False,
    limit=1000,
    return_plot=False,
    **kwargs,
):
    """Get the number of publications per year of a researcher.

    Args:
        id (str): The ID of the researcher. For example, ur.010551261751.12
        start_year (int, optional): The start year of the publication. Defaults to None.
        end_year (int, optional): The end year of the publication. Defaults to None.
        iterative (bool, optional): If True, the query will be iterative. Defaults to False.
        limit (int, optional): The number of results to return. Defaults to 1000.
        return_plot (bool, optional): If True, a bar plot is returned instead of the dataframe. Defaults to False.
        **kwargs: Passed on to the pandas plot.bar() call when return_plot is True.

    Returns:
        pandas.DataFrame: A dataframe with the columns "year" and "citations", sorted by
            year. Note that "citations" holds the number of publications in that year.
            If return_plot is True, the object returned by plot.bar() is returned instead.
    """

    result = self.search_pubs_by_researcher_id(
        id, start_year, end_year, iterative=iterative, limit=limit
    )
    # Publications per year, ordered by year.
    per_year = result.as_dataframe()["year"].value_counts().sort_index()

    if return_plot:
        return per_year.plot.bar(**kwargs)

    return pd.DataFrame({"year": per_year.index, "citations": per_year.values})

`search_grants_by_org(self, org_id, start_year=None, end_year=None, iterative=False, limit=1000, **kwargs)` ¶

Search grants by organization id.

Parameters:

Name	Type	Description	Default
`org_id`	`str`	The ID of the organization. For example, grid.411461.7	required
`start_year`	`int`	The start year of the publication. Defaults to None.	`None`
`end_year`	`int`	The end year of the publication. Defaults to None.	`None`
`iterative`	`bool`	If True, the query will be iterative. Defaults to False.	`False`
`limit`	`int`	The number of results to return. Defaults to 1000.	`1000`

Returns:

Type	Description
`dimcli.DslDataset`	JSON data of the results.

Source code in scholarpy/scholarpy.py

def search_grants_by_org(self, org_id, start_year=None, end_year=None, iterative=False, limit=1000, **kwargs):
    """Search grants by organization id.

    Args:
        org_id (str): The ID of the organization. For example, grid.411461.7
        start_year (int, optional): Earliest grant start year to include. Defaults to None (no lower bound).
        end_year (int, optional): Latest grant start year to include. Defaults to None (no upper bound).
        iterative (bool, optional): If True, the query will be iterative. Defaults to False.
        limit (int, optional): The number of results to return. Defaults to 1000.
        **kwargs: Accepted but not used by this method.

    Returns:
        dimcli.DslDataset: JSON data of the results.
    """

    filters = [f'research_orgs="{org_id}"']
    if start_year is not None:
        filters.append(f"start_year>={start_year}")
    if end_year is not None:
        # Applied whether or not start_year is given; previously end_year was
        # dropped when both bounds were supplied.
        filters.append(f"start_year<={end_year}")
    query = f"search grants where {' and '.join(filters)} return grants[basics+extras]"

    if iterative:
        result = self.query_iterative(query, limit=limit)
    else:
        query = f"{query} limit {limit}"
        result = self.query(query)

    return result

`search_grants_by_researcher(self, researcher_id, start_year=None, end_year=None, iterative=False, limit=1000, **kwargs)` ¶

Search grants by researcher id.

Parameters:

Name	Type	Description	Default
`researcher_id`	`str`	The ID of the researcher. For example, ur.01361677540.55	required
`start_year`	`int`	The start year of the publication. Defaults to None.	`None`
`end_year`	`int`	The end year of the publication. Defaults to None.	`None`
`iterative`	`bool`	If True, the query will be iterative. Defaults to False.	`False`
`limit`	`int`	The number of results to return. Defaults to 1000.	`1000`

Returns:

Type	Description
`dimcli.DslDataset`	JSON data of the results.

Source code in scholarpy/scholarpy.py

def search_grants_by_researcher(self, researcher_id, start_year=None, end_year=None, iterative=False, limit=1000, **kwargs):
    """Search grants by researcher id.

    Args:
        researcher_id (str): The ID of the researcher. For example, ur.01361677540.55
        start_year (int, optional): If given, only grants with start_year >= this value are matched. Defaults to None.
        end_year (int, optional): If given, only grants with start_year <= this value are matched. Defaults to None.
        iterative (bool, optional): If True, the query is run through query_iterative. Defaults to False.
        limit (int, optional): Passed to query_iterative as ``limit``, or appended to the
            query as ``limit N`` when not iterative. Defaults to 1000.

    Returns:
        dimcli.DslDataset: JSON data of the results.
    """
    filters = [f'researchers="{researcher_id}"']
    if start_year is not None:
        filters.append(f"start_year>={start_year}")
    if end_year is not None:
        # Applied independently of start_year so that passing both bounds
        # yields a closed range instead of silently dropping the upper bound.
        filters.append(f"start_year<={end_year}")

    where_clause = " and ".join(filters)
    query = f"search grants where {where_clause} return grants[basics+extras]"

    if iterative:
        return self.query_iterative(query, limit=limit)
    return self.query(f"{query} limit {limit}")

`search_journal_by_id(self, id, fields=None, **kwargs)` ¶

Search a journal by ID.

Parameters:

Name	Type	Description	Default
`id`	`str`	The ID of the journal. For example, jour.1018957	required
`fields`	`str`	The fields to return. For example, [basics+extras]. Defaults to None.	`None`

Returns:

Type	Description
`dimcli.DslDataset`	JSON data of the results.

Source code in scholarpy/scholarpy.py

def search_journal_by_id(self, id, fields=None, **kwargs):
    """Look up a single journal (source title) by its ID.

    Args:
        id (str): The ID of the journal. For example, jour.1018957
        fields (str, optional): Field selector appended to the return clause,
            e.g. [basics+extras]. When None, nothing is appended. Defaults to None.

    Returns:
        dimcli.DslDataset: JSON data of the results.
    """
    selector = "" if fields is None else fields
    return self.query(
        f'search source_titles where id="{id}" return source_titles{selector}'
    )

`search_journal_by_title(self, title, exact_match=True, fields=None, iterative=False, limit=1000, **kwargs)` ¶

Search a journal by title.

Parameters:

Name	Type	Description	Default
`title`	`str`	The title of the journal.	required
`exact_match`	`bool`	If True, the title must be an exact match. Defaults to True.	`True`
`fields`	`str`	The fields to return. For example, [basics+extras]. Defaults to None.	`None`
`iterative`	`bool`	If True, the query will be iterative. Defaults to False.	`False`
`limit`	`int`	The number of results to return. Defaults to 1000.	`1000`

Returns:

Type	Description
`dimcli.DslDataset`	JSON data of the results.

Source code in scholarpy/scholarpy.py

def search_journal_by_title(
    self,
    title,
    exact_match=True,
    fields=None,
    iterative=False,
    limit=1000,
    **kwargs,
):
    """Find journals (source titles) whose title matches a search string.

    Args:
        title (str): The title of the journal.
        exact_match (bool, optional): If True, the first hit whose title equals
            ``title`` (case-insensitively) is looked up again by its ID and that
            result is returned. Defaults to True.
        fields (str, optional): Field selector appended to the return clause,
            e.g. [basics+extras]. Defaults to None.
        iterative (bool, optional): If True, the search runs through query_iterative. Defaults to False.
        limit (int, optional): The number of results to return. Defaults to 1000.

    Returns:
        dimcli.DslDataset: JSON data of the results, or None when exact_match is
        True and the exact-match lookup fails for any reason.
    """
    selector = "" if fields is None else fields
    search_query = f'search source_titles for "{title}" return source_titles{selector}'

    if not iterative:
        hits = self.query(f"{search_query} limit {limit}")
    else:
        hits = self.query_iterative(search_query, limit=limit)

    # Without exact matching the raw search result is the answer.
    if not exact_match:
        return hits

    try:
        table = hits.as_dataframe()
        same_title = table[table["title"].str.lower() == title.lower()]
        # Indexing [0] raises when nothing matched; handled by the fallback below.
        journal_id = same_title["id"].values.tolist()[0]
        return self.query(
            f'search source_titles where id="{journal_id}" return source_titles{selector}'
        )
    except Exception:
        print("No journal can be found.")
        return None

`search_orcid_by_name(self, name, fields=None, iterative=False, limit=1000, return_list=False, **kwargs)` ¶

Search a researcher orcid by name.

Parameters:

Name	Type	Description	Default
`name`	`str`	The name of the researcher.	required
`fields`	`str`	The fields to return. For example, [basics+extras]. Defaults to None.	`None`
`iterative`	`bool`	If True, the query will be iterative. Defaults to False.	`False`
`limit`	`int`	The number of results to return. Defaults to 1000.	`1000`
`return_list`	`bool`	If True, the results will be returned as a list. Defaults to False.	`False`

Returns:

Type	Description
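`pd.DataFrame`	A dataframe of the results, a list of "name | orcid | organization" strings if return_list is True, or None if the results contain no ORCID column.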

Source code in scholarpy/scholarpy.py

def search_orcid_by_name(
    self,
    name,
    fields=None,
    iterative=False,
    limit=1000,
    return_list=False,
    **kwargs,
):
    """Search researcher ORCID iDs by researcher name.

    The lookup is delegated to search_researcher_by_name; rows without an
    ORCID are discarded and the rest are sorted by current organization name.

    Args:
        name (str): The name of the researcher.
        fields (str, optional): The fields to return. For example, [basics+extras]. Defaults to None.
        iterative (bool, optional): If True, the query will be iterative. Defaults to False.
        limit (int, optional): The number of results to return. Defaults to 1000.
        return_list (bool, optional): If True, return a list of
            "first last | orcid | organization" strings instead of a dataframe. Defaults to False.

    Returns:
        pd.DataFrame | list | None: The filtered dataframe, the list of label
        strings when return_list is True, or None when the result has no
        "orcid_id" column.
    """
    result = self.search_researcher_by_name(
        name, fields=fields, iterative=iterative, limit=limit, return_list=False
    )
    df = result.as_dataframe()
    if "orcid_id" not in df.columns:
        return None

    # Take an explicit copy of the filtered rows: the column assignments and
    # in-place sort below would otherwise write into a slice of the original
    # frame (pandas SettingWithCopyWarning).
    df = df[df["orcid_id"].notnull()].copy()

    # Each cell is indexed with [0], i.e. only the first ORCID entry is kept.
    df["orcid_id"] = [orcids[0] for orcids in df["orcid_id"].values.tolist()]
    df["current_research_org_name"] = df["current_research_org.name"]
    df.sort_values("current_research_org_name", inplace=True)

    if not return_list:
        return df

    df["uid"] = (
        df["first_name"]
        + " "
        + df["last_name"]
        + " | "
        + df["orcid_id"]
        + " | "
        + df["current_research_org.name"]
    )
    # A missing value in any component makes the concatenated label null.
    df = df[df["uid"].notnull()]
    return df["uid"].tolist()

`search_org_by_id(self, id, fields=None, **kwargs)` ¶

Search an organization by ID. For example, grid.411461.7

Parameters:

Name	Type	Description	Default
`id`	`str`	The ID of the organization. For example, grid.411461.7	required
`fields`	`str`	The fields to return. For example, [basics+extras]. Defaults to None.	`None`

Returns:

Type	Description
`dimcli.DslDataset`	JSON data of the results.

Source code in scholarpy/scholarpy.py

def search_org_by_id(self, id, fields=None, **kwargs):
    """Look up a single organization by its ID, e.g. grid.411461.7.

    Args:
        id (str): The ID of the organization.
        fields (str, optional): Field selector appended to the return clause,
            e.g. [basics+extras]. When None, nothing is appended. Defaults to None.

    Returns:
        dimcli.DslDataset: JSON data of the results.
    """
    selector = fields if fields is not None else ""
    org_query = f'search organizations where id="{id}" return organizations{selector}'
    return self.query(org_query)

`search_org_by_name(self, name, exact_match=True, fields=None, iterative=False, limit=1000, return_list=False, **kwargs)` ¶

Search an organization by name.

Parameters:

Name	Type	Description	Default
`name`	`str`	The name of the organization.	required
`exact_match`	`bool`	If True, the name must be an exact match. Defaults to True.	`True`
`fields`	`str`	The fields to return. For example, [basics+extras]. Defaults to None.	`None`
`iterative`	`bool`	If True, the query will be iterative. Defaults to False.	`False`
`limit`	`int`	The number of results to return. Defaults to 1000.	`1000`
`return_list`	`bool`	If True, the results will be returned as a list. Defaults to False.	`False`

Returns:

Type	Description
`dimcli.DslDataset`	JSON data of the results.

Source code in scholarpy/scholarpy.py

def search_org_by_name(
    self, name, exact_match=True, fields=None, iterative=False, limit=1000, return_list=False, **kwargs
):
    """Find organizations by name.

    Args:
        name (str): The name of the organization. It is searched as a quoted phrase.
        exact_match (bool, optional): If True, the first hit whose name equals
            ``name`` (case-insensitively) is looked up again by its ID and that
            result replaces the search result. Defaults to True.
        fields (str, optional): Field selector appended to the return clause,
            e.g. [basics+extras]. Defaults to None.
        iterative (bool, optional): If True, the search runs through query_iterative. Defaults to False.
        limit (int, optional): The number of results to return. Defaults to 1000.
        return_list (bool, optional): If True, return a name-sorted list of
            "id | name" strings instead of the dataset. Defaults to False.

    Returns:
        dimcli.DslDataset | list | None: The dataset; the list of "id | name"
        strings when return_list is True (None if the dataset is empty); or
        None when any step after the initial search raises.
    """
    selector = "" if fields is None else fields
    search_query = f'search organizations for "\\"{name}\\"" return organizations{selector}'
    if not iterative:
        result = self.query(f"{search_query} limit {limit}")
    else:
        result = self.query_iterative(search_query, limit=limit)

    try:
        if exact_match:
            candidates = result.as_dataframe()
            same_name = candidates[candidates["name"].str.lower() == name.lower()]
            # Indexing [0] raises when nothing matched; handled by the fallback below.
            org_id = same_name["id"].values.tolist()[0]
            result = self.query(
                f'search organizations where id="{org_id}" return organizations{selector}'
            )

        if not return_list:
            return result

        table = result.as_dataframe()
        if table.empty:
            return None

        table.sort_values("name", inplace=True)
        table["name_id"] = table["id"] + " | " + table["name"]
        return table["name_id"].values.tolist()

    except Exception:
        print("No organization can be found.")
        return None

`search_pubs_by_journal_id(self, id, start_year=None, end_year=None, fields=None, iterative=False, limit=1000, **kwargs)` ¶

Search publications by journal ID.

Parameters:

Name	Type	Description	Default
`id`	`str`	The ID of the journal. For example, jour.1018957	required
`start_year`	`int`	The start year of the publication. Defaults to None.	`None`
`end_year`	`int`	The end year of the publication. Defaults to None.	`None`
`fields`	`str`	The fields to return. For example, [basics+extras]. Defaults to None.	`None`
`iterative`	`bool`	If True, the query will be iterative. Defaults to False.	`False`
`limit`	`int`	The number of results to return. Defaults to 1000.	`1000`

Returns:

Type	Description
`dimcli.DslDataset`	JSON data of the results.

Source code in scholarpy/scholarpy.py

def search_pubs_by_journal_id(
    self,
    id,
    start_year=None,
    end_year=None,
    fields=None,
    iterative=False,
    limit=1000,
    **kwargs,
):
    """Search publications that appeared in a given journal.

    Args:
        id (str): The ID of the journal. For example, jour.1018957
        start_year (int, optional): If given, only publications with year >= this value. Defaults to None.
        end_year (int, optional): If given, only publications with year <= this value. Defaults to None.
        fields (str, optional): Field selector appended to the return clause,
            e.g. [basics+extras]. Defaults to None.
        iterative (bool, optional): If True, the query runs through query_iterative. Defaults to False.
        limit (int, optional): The number of results to return. Defaults to 1000.

    Returns:
        dimcli.DslDataset: JSON data of the results.
    """
    selector = "" if fields is None else fields

    # Collect the active filters and join them, instead of spelling out one
    # query template per combination of year bounds.
    conditions = [f'journal.id="{id}"']
    if start_year is not None:
        conditions.append(f"year>={start_year}")
    if end_year is not None:
        conditions.append(f"year<={end_year}")
    where_clause = " and ".join(conditions)

    query = f"search publications where {where_clause} return publications{selector}"

    if not iterative:
        return self.query(f"{query} limit {limit}")
    return self.query_iterative(query, limit=limit)

`search_pubs_by_keyword(self, keyword, exact_match=True, scope='title_abstract_only', start_year=None, end_year=None, journal_id=None, fields=None, sorted_field='times_cited', iterative=False, limit=1000, **kwargs)` ¶

Search publications by keyword.

Parameters:

Name	Type	Description	Default
`keyword`	`str`	The keyword to search.	required
`exact_match`	`bool`	If True, the keyword will be matched exactly. Defaults to True.	`True`
`scope`	`str`	The scope of the search. Defaults to "title_abstract_only".	`'title_abstract_only'`
`start_year`	`int`	The start year of the publication. Defaults to None.	`None`
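`end_year`	`int`	The end year of the publication. Defaults to None.	`None`
`journal_id`	`str`	If given, restrict the results to publications in this journal. For example, jour.1018957. Defaults to None.	`None`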
`fields`	`str`	The fields to return. For example, [basics+extras]. Defaults to None.	`None`
`sorted_field`	`str`	The field to sort by. Defaults to "times_cited".	`'times_cited'`
`iterative`	`bool`	If True, the query will be iterative. Defaults to False.	`False`
`limit`	`int`	The number of results to return. Defaults to 1000.	`1000`

Exceptions:

Type	Description
`ValueError`	If scope is not one of: authors, concepts, full_data, full_data_exact, title_abstract_only, title_only.

Returns:

Type	Description
`dimcli.DslDataset`	JSON data of the results.

Source code in scholarpy/scholarpy.py

def search_pubs_by_keyword(
    self,
    keyword,
    exact_match=True,
    scope="title_abstract_only",
    start_year=None,
    end_year=None,
    journal_id=None,
    fields=None,
    sorted_field="times_cited",
    iterative=False,
    limit=1000,
    **kwargs,
):
    """Search publications by keyword.

    Args:
        keyword (str): The keyword to search.
        exact_match (bool, optional): If True, the keyword is wrapped in escaped
            double quotes inside the query. Defaults to True.
        scope (str, optional): The search index to query; one of authors,
            concepts, full_data, full_data_exact, title_abstract_only,
            title_only. Defaults to "title_abstract_only".
        start_year (int, optional): If given, only publications with year >= this value. Defaults to None.
        end_year (int, optional): If given, only publications with year <= this value. Defaults to None.
        journal_id (str, optional): If given, only publications whose journal.id equals this value. Defaults to None.
        fields (str, optional): Field selector appended to the return clause. When
            None, a default selection (basics, altmetric, times_cited,
            field_citation_ratio, authors_count, doi, dimensions_url) is used.
        sorted_field (str, optional): The field to sort by. Defaults to "times_cited".
        iterative (bool, optional): If True, the query runs through query_iterative. Defaults to False.
        limit (int, optional): The number of results to return. Defaults to 1000.

    Raises:
        ValueError: If scope is not one of the allowed scopes.

    Returns:
        dimcli.DslDataset: JSON data of the results.
    """
    allowed_scopes = [
        "authors",
        "concepts",
        "full_data",
        "full_data_exact",
        "title_abstract_only",
        "title_only",
    ]
    if scope not in allowed_scopes:
        raise ValueError(f"scope must be one of {allowed_scopes}")

    quote = '\\"' if exact_match else ""

    if fields is None:
        fields = "[basics+altmetric+times_cited+field_citation_ratio+authors_count+doi+dimensions_url]"

    year_filters = []
    if start_year is not None:
        year_filters.append(f"year>={start_year}")
    if end_year is not None:
        year_filters.append(f"year<={end_year}")

    if year_filters:
        # The journal filter is chained after the year bounds. The trailing
        # space is kept even without a journal filter so the generated query
        # text stays exactly as before.
        journal_part = f'and journal.id="{journal_id}"' if journal_id is not None else ""
        where_clause = "where " + " and ".join(year_filters) + " " + journal_part
    elif journal_id is not None:
        where_clause = f'where journal.id="{journal_id}"'
    else:
        where_clause = ""

    query = (
        f'search publications in {scope} for "{quote}{keyword}{quote}" '
        f"{where_clause} return publications{fields} sort by {sorted_field}"
    )

    if not iterative:
        return self.query(f"{query} limit {limit}")
    return self.query_iterative(query, limit=limit)

`search_pubs_by_org_id(self, id, start_year=None, end_year=None, fields=None, iterative=False, limit=1000, **kwargs)` ¶

Search publications by organization ID.

Parameters:

Name	Type	Description	Default
`id`	`str`	The ID of the organization. For example, grid.411461.7	required
`start_year`	`int`	The start year of the publication. Defaults to None.	`None`
`end_year`	`int`	The end year of the publication. Defaults to None.	`None`
`fields`	`str`	The fields to return. For example, [basics+extras]. Defaults to None.	`None`
`iterative`	`bool`	If True, the query will be iterative. Defaults to False.	`False`
`limit`	`int`	The number of results to return. Defaults to 1000.	`1000`

Returns:

Type	Description
`dimcli.DslDataset`	JSON data of the results.

Source code in scholarpy/scholarpy.py

def search_pubs_by_org_id(
    self,
    id,
    start_year=None,
    end_year=None,
    fields=None,
    iterative=False,
    limit=1000,
    **kwargs,
):
    """Search publications affiliated with a given organization.

    Args:
        id (str): The ID of the organization. For example, grid.411461.7
        start_year (int, optional): If given, only publications with year >= this value. Defaults to None.
        end_year (int, optional): If given, only publications with year <= this value. Defaults to None.
        fields (str, optional): Field selector appended to the return clause,
            e.g. [basics+extras]. Defaults to None.
        iterative (bool, optional): If True, the query runs through query_iterative. Defaults to False.
        limit (int, optional): The number of results to return. Defaults to 1000.

    Returns:
        dimcli.DslDataset: JSON data of the results.
    """
    selector = fields if fields is not None else ""

    # Optional year bounds are appended to the mandatory organization filter.
    year_clause = ""
    if start_year is not None:
        year_clause += f" and year>={start_year}"
    if end_year is not None:
        year_clause += f" and year<={end_year}"

    query = f'search publications where research_orgs="{id}"{year_clause} return publications{selector}'

    if iterative:
        return self.query_iterative(query, limit=limit)
    return self.query(f"{query} limit {limit}")

`search_pubs_by_researcher_id(self, id, start_year=None, end_year=None, fields=None, iterative=False, limit=1000, **kwargs)` ¶

Search publications by researcher ID.

Parameters:

Name	Type	Description	Default
`id`	`str`	The ID of the researcher. For example, ur.010551261751.12	required
`start_year`	`int`	The start year of the publication. Defaults to None.	`None`
`end_year`	`int`	The end year of the publication. Defaults to None.	`None`
`fields`	`str`	The fields to return. For example, [basics+extras]. Defaults to None.	`None`
`iterative`	`bool`	If True, the query will be iterative. Defaults to False.	`False`
`limit`	`int`	The number of results to return. Defaults to 1000.	`1000`

Returns:

Type	Description
`dimcli.DslDataset`	JSON data of the results.

Source code in scholarpy/scholarpy.py

def search_pubs_by_researcher_id(
    self,
    id,
    start_year=None,
    end_year=None,
    fields=None,
    iterative=False,
    limit=1000,
    **kwargs,
):
    """Search publications authored by a given researcher.

    Args:
        id (str): The ID of the researcher. For example, ur.010551261751.12
        start_year (int, optional): If given, only publications with year >= this value. Defaults to None.
        end_year (int, optional): If given, only publications with year <= this value. Defaults to None.
        fields (str, optional): Field selector appended to the return clause. When
            None, [basics+authors_count+times_cited+dimensions_url] is used.
        iterative (bool, optional): If True, the query runs through query_iterative. Defaults to False.
        limit (int, optional): The number of results to return. Defaults to 1000.

    Returns:
        dimcli.DslDataset: JSON data of the results.
    """
    selector = "[basics+authors_count+times_cited+dimensions_url]" if fields is None else fields

    # (bound value, comparison) pairs; only the bounds actually supplied
    # contribute a filter.
    bounds = ((start_year, "year>="), (end_year, "year<="))
    conditions = [f'researchers.id="{id}"']
    conditions.extend(f"{op}{value}" for value, op in bounds if value is not None)

    query = "search publications where " + " and ".join(conditions) + f" return publications{selector}"

    if not iterative:
        return self.query(f"{query} limit {limit}")
    return self.query_iterative(query, limit=limit)

`search_researcher_by_id(self, id, fields=None, iterative=False, limit=1000, return_df=False, **kwargs)` ¶

Search a researcher by ID.

Parameters:

Name	Type	Description	Default
`id`	`str`	The ID of the researcher. For example, ur.010551261751.12	required
`fields`	`str`	The fields to return. For example, [basics+extras]. Defaults to None.	`None`
`iterative`	`bool`	If True, the query will be iterative. Defaults to False.	`False`
`limit`	`int`	The number of results to return. Defaults to 1000.	`1000`
`return_df`	`bool`	If True, the results will be returned as a dataframe. Defaults to False.	`False`

Returns:

Type	Description
`dimcli.DslDataset`	JSON data of the results, or a two-column (index, value) pd.DataFrame if return_df is True.

Source code in scholarpy/scholarpy.py

def search_researcher_by_id(
    self,
    id,
    fields=None,
    iterative=False,
    limit=1000,
    return_df=False,
    **kwargs,
):
    """Search a researcher by a Dimensions researcher ID.

    Args:
        id (str): The ID of the researcher. For example, ur.010551261751.12
        fields (str, optional): The fields to return. When None, [basics+extras] is used.
        iterative (bool, optional): If True, the query runs through query_iterative
            (extra keyword arguments are forwarded to it). Defaults to False.
        limit (int, optional): The number of results to return. Defaults to 1000.
        return_df (bool, optional): If True, the results will be returned as a dataframe. Defaults to False.

    Returns:
        dimcli.DslDataset | pd.DataFrame: The dataset, or, when return_df is
        True, a dataframe with one row per result field: column "index" holds
        the field name and column "value" the list of that field's values
        across the returned records.
    """
    if fields is None:
        fields = "[basics+extras]"

    query = f'search researchers where id="{id}" return researchers{fields}'

    if iterative:
        result = self.query_iterative(query, limit=limit, **kwargs)
    else:
        result = self.query(f"{query} limit {limit}")

    if not return_df:
        return result

    # Transpose so that each field becomes a row, then flatten into an
    # explicit (field name, values) table.
    by_field = result.as_dataframe().transpose()
    return pd.DataFrame(
        {"index": by_field.index.tolist(), "value": by_field.values.tolist()}
    )

`search_researcher_by_name(self, name, fields=None, iterative=False, limit=1000, return_list=False, **kwargs)` ¶

Search a researcher by name.

Parameters:

Name	Type	Description	Default
`name`	`str`	The name of the researcher.	required
`fields`	`str`	The fields to return. For example, [basics+extras]. Defaults to None.	`None`
`iterative`	`bool`	If True, the query will be iterative. Defaults to False.	`False`
`limit`	`int`	The number of results to return. Defaults to 1000.	`1000`
`return_list`	`bool`	If True, the results will be returned as a list. Defaults to False.	`False`

Returns:

Type	Description
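`dimcli.DslDataset`	JSON data of the results. If return_list is True, a tuple of the results and a list of "name | id | organization" strings (None if there are no results).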

Source code in scholarpy/scholarpy.py

def search_researcher_by_name(
    self,
    name,
    fields=None,
    iterative=False,
    limit=1000,
    return_list=False,
    **kwargs,
):
    """Search researchers by name.

    The name is searched as a quoted phrase, restricted to non-obsolete
    researchers with at least one publication.

    Args:
        name (str): The name of the researcher.
        fields (str, optional): The fields to return. When None, [basics+extras] is used.
        iterative (bool, optional): If True, the query runs through query_iterative. Defaults to False.
        limit (int, optional): The number of results to return. Defaults to 1000.
        return_list (bool, optional): If True, a list of display labels is
            returned alongside the dataset. Defaults to False.

    Returns:
        dimcli.DslDataset | tuple: The dataset; or, when return_list is True, a
        ``(dataset, labels)`` tuple where labels is a list of
        "first last | id | organization" strings sorted by organization name,
        or None when the dataset is empty.
    """
    if fields is None:
        fields = "[basics+extras]"

    query = f'search researchers for "\\"{name}\\"" where obsolete=0 and total_publications>0 return researchers{fields}'
    if not iterative:
        result = self.query(f"{query} limit {limit}")
    else:
        result = self.query_iterative(query, limit=limit)

    if not return_list:
        return result

    df = result.as_dataframe()
    if df.empty:
        return result, None

    # Copy the dotted column to an identifier-safe name so it can be read as
    # an attribute of the row tuples below.
    df["current_research_org_name"] = df["current_research_org.name"]
    df.sort_values("current_research_org_name", inplace=True)

    labels = [
        " | ".join(
            (
                str(row.first_name) + " " + str(row.last_name),
                str(row.id),
                str(row.current_research_org_name),
            )
        )
        for row in df.itertuples()
    ]
    return result, labels

`search_researcher_collaborators(self, id, pubs=None)` ¶

Search collaborators of a researcher.

Parameters:

Name	Type	Description	Default
`id`	`str`	The ID of the researcher. For example, ur.010551261751.12	required
`pubs`	`dimcli.DslDataset`	The publications of the researcher. Defaults to None.	`None`

Returns:

Type	Description
`pd.DataFrame`	A dataframe of the collaborators.

Source code in scholarpy/scholarpy.py

def search_researcher_collaborators(self, id, pubs=None):
    """Search collaborators of a researcher.

    Collaborators are the co-authors listed on the researcher's publications,
    identified by "<first token of first_name> <last_name>".

    Args:
        id (str): The ID of the researcher. For example, ur.010551261751.12
        pubs (dimcli.DslDataset, optional): The publications of the researcher. When
            None, they are fetched with search_pubs_by_researcher_id. Defaults to None.

    Returns:
        pd.DataFrame: One row per collaborator with columns "Name", "Count"
        (number of author rows carrying that name) and "Institution" (name of
        the first affiliation on the collaborator's first occurrence, or ""
        when unavailable).
    """
    if pubs is None:
        pubs = self.search_pubs_by_researcher_id(id)

    authors = pubs.as_dataframe_authors()
    # Copy the filtered rows so adding the "Name" column does not write into a
    # slice of the original frame.
    authors = authors[authors["researcher_id"] != id].copy()
    authors["Name"] = (
        authors["first_name"].str.split(" ").str[0] + " " + authors["last_name"]
    )

    names = authors.drop_duplicates("Name").copy()
    institutions = []
    for affiliation in names["affiliations"].values.tolist():
        try:
            institutions.append(affiliation[0]["name"])
        except (IndexError, KeyError, TypeError):
            # Empty list, entry without a "name" key, or a missing /
            # non-list value.
            institutions.append("")
    names["Institution"] = institutions
    names = names[["Name", "Institution"]]

    # Build the count table from the Series directly: wrapping value_counts()
    # in a DataFrame and reading column "Name" breaks on pandas >= 2.0, where
    # the counts column is called "count".
    counts = authors["Name"].value_counts()
    result = pd.DataFrame({"Name": counts.index, "Count": counts.values})
    return result.merge(names, on="Name")

`annual_stats_barplot(df, columns=None, **kwargs)` ¶

Get a barplot of the annual stats.

Parameters:

Name	Type	Description	Default
`df`	`pd.DataFrame`	The dataframe of the results. Can be derived from df, _ = dsl.researcher_annual_stats().	required
`columns`	`list`	The columns to plot. Defaults to None.	`None`

Returns:

Type	Description
`px.Bar`	The barplot.

Source code in scholarpy/scholarpy.py

def annual_stats_barplot(df, columns=None, **kwargs):
    """Draw a grouped bar chart of per-year statistics.

    Args:
        df (pd.DataFrame): The dataframe of the results. Can be derived from df, _ = dsl.researcher_annual_stats().
            It is plotted with its "year" column on the x axis.
        columns (list, optional): The columns to plot as bar groups. When None,
            "pubs", "collaborators", "institutions" and "cities" are used.

    Returns:
        px.Bar: The barplot.
    """
    y_columns = (
        ["pubs", "collaborators", "institutions", "cities"]
        if columns is None
        else columns
    )
    return px.bar(df, x="year", y=y_columns, barmode="group")

`collaborator_locations(df)` ¶

Get the locations of collaborators.

Parameters:

Name	Type	Description	Default
`df`	`pd.DataFrame`	The dataframe of collaborators, can be derived from _, df = dsl.researcher_annual_stats().	required

Returns:

Type	Description
`pd.DataFrame`	The dataframe of the results.

Source code in scholarpy/scholarpy.py

def collaborator_locations(df):
    """Get the distinct locations of collaborators.

    The "name", "year" and "index" columns are removed if present, duplicate
    rows are collapsed, and rows whose latitude is 0 are discarded. The input
    dataframe is left unmodified.

    Args:
        df (pd.DataFrame): The dataframe of collaborators, can be derived from _, df = dsl.researcher_annual_stats().
            Must contain a "latitude" column.

    Returns:
        pd.DataFrame: The de-duplicated locations, re-indexed from 0.
    """
    unwanted = [col for col in ("name", "year", "index") if col in df.columns]

    # Non-inplace operations: each step returns a new frame, so the caller's
    # dataframe keeps its columns and rows.
    locations = df.drop(columns=unwanted).drop_duplicates()
    locations = locations[locations["latitude"] != 0]
    return locations.reset_index(drop=True)

`geoname_latlon(id, df=None)` ¶

Get the latitude and longitude of a city based on a GeoNames city id.

Parameters:

Name	Type	Description	Default
`id`	`str`	The GeoName city id.	required
`df`	`pd.DataFrame`	The geonames dataframe. Defaults to None.	`None`

Returns:

Type	Description
`tuple`	The latitude and longitude of the city.

Source code in scholarpy/scholarpy.py

def geoname_latlon(id, df=None):
    """Get the latitude and longitude of a city based on a GeoNames city id.

    Args:
        id (str | int): The GeoNames city id. Non-int values are converted with int().
        df (pd.DataFrame, optional): The geonames dataframe, with "geonameid",
            "latitude" and "longitude" columns. When None, it is loaded with
            get_geonames(). Defaults to None.

    Raises:
        ValueError: If id cannot be converted to an integer.

    Returns:
        tuple: The latitude and longitude of the first matching row, or (0, 0)
        when no row has that id.
    """
    if not isinstance(id, int):
        try:
            id = int(id)
        except (TypeError, ValueError, OverflowError) as err:
            # Only conversion failures are translated; the original error is
            # chained so the offending value stays visible in the traceback.
            raise ValueError("id must be an integer") from err

    if df is None:
        df = get_geonames()

    matches = df[df["geonameid"] == id]
    if matches.empty:
        return 0, 0

    first = matches.iloc[0]
    return first["latitude"], first["longitude"]

`get_geonames(**kwargs)` ¶

Get the geonames dataframe.

Returns:

Type	Description
`pd.DataFrame`	The dataframe of the results.

Source code in scholarpy/scholarpy.py

def get_geonames(**kwargs):
    """Get the geonames dataframe.

    Downloads the cities table from a fixed URL and returns a subset of its
    columns.

    Args:
        **kwargs: Only ``columns`` is consulted. When it is a list, exactly those
            columns are returned; otherwise the default subset (geonameid, name,
            country_code, population, latitude, longitude) is used.

    Returns:
        pd.DataFrame: The dataframe of the results.
    """
    default_columns = [
        "geonameid",
        "name",
        "country_code",
        "population",
        "latitude",
        "longitude",
    ]
    source_url = "https://raw.githubusercontent.com/giswqs/data/main/world/cities5000.csv"

    # Columns listed by the original author as available in the file:
    # geonameid, name, asciiname, alternatenames, latitude, longitude,
    # feature_class, feature_code, country_code, cc2, admin1_code, admin2_code,
    # admin3_code, admin4_code, population, elevation, dem, timezone,
    # modification_date
    cities = pd.read_csv(source_url, sep="\t", encoding="utf-8")

    requested = kwargs.get("columns")
    selected = requested if isinstance(requested, list) else default_columns
    return cities[selected]

`json_to_df(json_data, transpose=False)` ¶

Convert a json file to a dataframe.

Parameters:

Name	Type	Description	Default
`json_data`	`json`	The json data.	required
`transpose`	`bool`	If True, transpose the dataframe. Defaults to False.	`False`

Returns:

Type	Description
`pd.DataFrame`	The dataframe of the results.

Source code in scholarpy/scholarpy.py

def json_to_df(json_data, transpose=False):
    """Convert query results to a dataframe.

    The dataframe obtained from ``json_data.as_dataframe()`` is serialised to
    CSV and parsed back, so the returned frame holds whatever ``pd.read_csv``
    produces from that text. The round trip happens in memory, so no temporary
    file is created or left behind on failure.

    Args:
        json_data (json): The json data; an object exposing ``as_dataframe()``,
            such as the result of a Dsl query.
        transpose (bool, optional): If True, transpose the dataframe. The former
            column names are then written out as the first CSV column.
            Defaults to False.

    Returns:
        pd.DataFrame: The dataframe of the results, or None when the data is empty.
    """
    import io

    df = json_data.as_dataframe()
    if df.empty:
        return None

    if transpose:
        df = df.transpose()

    buffer = io.StringIO()
    # The index is only kept when transposed, because it then carries the
    # original field names.
    df.to_csv(buffer, index=transpose)
    buffer.seek(0)
    return pd.read_csv(buffer)

Last update: 2022-02-09

scholarpy module¶

Dsl (Dsl) ¶

__init__(self, key=None, **kwargs) special ¶

h_index(self, id, iterative=False, limit=1000) ¶

org_grant_funders(self, org_id, start_year=None, end_year=None, iterative=False, limit=20, return_plot=False, **kwargs) ¶

org_grants_annual_stats(self, org_id, start_year=None, end_year=None, iterative=False, limit=100, return_plot=False, **kwargs) ¶

org_pubs_annual_stats(self, org_id, start_year=None, end_year=None, iterative=False, limit=1000, return_plot=False, **kwargs) ¶

researcher_annual_stats(self, data, geonames_df=None) ¶

researcher_pubs_authors(self, id, start_year=None, end_year=None, iterative=False, limit=1000) ¶

researcher_pubs_stats(self, id, start_year=None, end_year=None, iterative=False, limit=1000, return_plot=False, **kwargs) ¶

search_grants_by_org(self, org_id, start_year=None, end_year=None, iterative=False, limit=1000, **kwargs) ¶

search_grants_by_researcher(self, researcher_id, start_year=None, end_year=None, iterative=False, limit=1000, **kwargs) ¶

search_journal_by_id(self, id, fields=None, **kwargs) ¶

search_journal_by_title(self, title, exact_match=True, fields=None, iterative=False, limit=1000, **kwargs) ¶

search_orcid_by_name(self, name, fields=None, iterative=False, limit=1000, return_list=False, **kwargs) ¶

search_org_by_id(self, id, fields=None, **kwargs) ¶

search_org_by_name(self, name, exact_match=True, fields=None, iterative=False, limit=1000, return_list=False, **kwargs) ¶

search_pubs_by_journal_id(self, id, start_year=None, end_year=None, fields=None, iterative=False, limit=1000, **kwargs) ¶

search_pubs_by_keyword(self, keyword, exact_match=True, scope='title_abstract_only', start_year=None, end_year=None, journal_id=None, fields=None, sorted_field='times_cited', iterative=False, limit=1000, **kwargs) ¶

search_pubs_by_org_id(self, id, start_year=None, end_year=None, fields=None, iterative=False, limit=1000, **kwargs) ¶

search_pubs_by_researcher_id(self, id, start_year=None, end_year=None, fields=None, iterative=False, limit=1000, **kwargs) ¶

search_researcher_by_id(self, id, fields=None, iterative=False, limit=1000, return_df=False, **kwargs) ¶

search_researcher_by_name(self, name, fields=None, iterative=False, limit=1000, return_list=False, **kwargs) ¶

search_researcher_collaborators(self, id, pubs=None) ¶

annual_stats_barplot(df, columns=None, **kwargs) ¶

collaborator_locations(df) ¶

geoname_latlon(id, df=None) ¶

get_geonames(**kwargs) ¶

json_to_df(json_data, transpose=False) ¶

`Dsl (Dsl)` ¶

`__init__(self, key=None, **kwargs)` `special` ¶

`h_index(self, id, iterative=False, limit=1000)` ¶

`org_grant_funders(self, org_id, start_year=None, end_year=None, iterative=False, limit=20, return_plot=False, **kwargs)` ¶

`org_grants_annual_stats(self, org_id, start_year=None, end_year=None, iterative=False, limit=100, return_plot=False, **kwargs)` ¶

`org_pubs_annual_stats(self, org_id, start_year=None, end_year=None, iterative=False, limit=1000, return_plot=False, **kwargs)` ¶

`researcher_annual_stats(self, data, geonames_df=None)` ¶

`researcher_pubs_authors(self, id, start_year=None, end_year=None, iterative=False, limit=1000)` ¶

`researcher_pubs_stats(self, id, start_year=None, end_year=None, iterative=False, limit=1000, return_plot=False, **kwargs)` ¶

`search_grants_by_org(self, org_id, start_year=None, end_year=None, iterative=False, limit=1000, **kwargs)` ¶

`search_grants_by_researcher(self, researcher_id, start_year=None, end_year=None, iterative=False, limit=1000, **kwargs)` ¶

`search_journal_by_id(self, id, fields=None, **kwargs)` ¶

`search_journal_by_title(self, title, exact_match=True, fields=None, iterative=False, limit=1000, **kwargs)` ¶

`search_orcid_by_name(self, name, fields=None, iterative=False, limit=1000, return_list=False, **kwargs)` ¶

`search_org_by_id(self, id, fields=None, **kwargs)` ¶

`search_org_by_name(self, name, exact_match=True, fields=None, iterative=False, limit=1000, return_list=False, **kwargs)` ¶

`search_pubs_by_journal_id(self, id, start_year=None, end_year=None, fields=None, iterative=False, limit=1000, **kwargs)` ¶

`search_pubs_by_keyword(self, keyword, exact_match=True, scope='title_abstract_only', start_year=None, end_year=None, journal_id=None, fields=None, sorted_field='times_cited', iterative=False, limit=1000, **kwargs)` ¶

`search_pubs_by_org_id(self, id, start_year=None, end_year=None, fields=None, iterative=False, limit=1000, **kwargs)` ¶

`search_pubs_by_researcher_id(self, id, start_year=None, end_year=None, fields=None, iterative=False, limit=1000, **kwargs)` ¶

`search_researcher_by_id(self, id, fields=None, iterative=False, limit=1000, return_df=False, **kwargs)` ¶

`search_researcher_by_name(self, name, fields=None, iterative=False, limit=1000, return_list=False, **kwargs)` ¶

`search_researcher_collaborators(self, id, pubs=None)` ¶

`annual_stats_barplot(df, columns=None, **kwargs)` ¶

`collaborator_locations(df)` ¶

`geoname_latlon(id, df=None)` ¶

`get_geonames(**kwargs)` ¶

`json_to_df(json_data, transpose=False)` ¶