You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

369 lines
10 KiB

import requests
import json
import argparse
from typing import Optional, List, Literal, Union
from colorprinter.print_color import *
def search_semantic_scholar(
    query: str,
    limit: int = 10,
    fields: Optional[List[str]] = None,
    publication_types: Optional[
        List[
            Literal[
                "Review",
                "JournalArticle",
                "CaseReport",
                "ClinicalTrial",
                "Conference",
                "Dataset",
                "Editorial",
                "LettersAndComments",
                "MetaAnalysis",
                "News",
                "Study",
                "Book",
                "BookSection",
            ]
        ]
    ] = ["JournalArticle"],  # shared default list: read-only here, never mutate it
    open_access: bool = False,
    min_citation_count: Optional[int] = None,
    date_range: Optional[str] = None,
    year_range: Optional[str] = None,
    fields_of_study: Optional[
        List[
            Literal[
                "Computer Science",
                "Medicine",
                "Chemistry",
                "Biology",
                "Materials Science",
                "Physics",
                "Geology",
                "Psychology",
                "Art",
                "History",
                "Geography",
                "Sociology",
                "Business",
                "Political Science",
                "Economics",
                "Philosophy",
                "Mathematics",
                "Engineering",
                "Environmental Science",
                "Agricultural and Food Sciences",
                "Education",
                "Law",
                "Linguistics",
            ]
        ]
    ] = None,
):
    """
    Search for papers on Semantic Scholar with various filters.

    Parameters:
    -----------
    query : str
        The search query term
    limit : int
        Number of results to return (max 100)
    fields : List[str], optional
        List of fields to include in the response. When None, a default
        set (title, url, abstract, year, publicationDate, authors.name,
        citationCount, openAccessPdf, tldr) is requested.
    publication_types : List[str], optional
        Filter by publication types. Defaults to ["JournalArticle"];
        pass None or an empty list to disable the filter.
    open_access : bool
        Only include papers with open access PDFs
    min_citation_count : int, optional
        Minimum number of citations. None and 0 both leave the filter off.
    date_range : str, optional
        Date range in format "YYYY-MM-DD:YYYY-MM-DD"
    year_range : str, optional
        Year range in format "YYYY-YYYY" or "YYYY-" or "-YYYY"
    fields_of_study : List[str], optional
        List of fields of study to filter by

    Returns:
    --------
    list or None
        The value stored under the "data" key of the JSON response (an
        empty list when that key is absent), or None when the request
        fails. Failures are printed to stdout rather than raised.
    """
    # Define the API endpoint URL
    url = "https://api.semanticscholar.org/graph/v1/paper/search"

    # Set up default fields if not provided
    if fields is None:
        fields = [
            "title",
            "url",
            "abstract",
            "year",
            "publicationDate",
            "authors.name",
            "citationCount",
            "openAccessPdf",
            "tldr",
        ]

    # Build query parameters
    params = {"query": query, "limit": limit, "fields": ",".join(fields)}

    # Add optional filters if provided
    if publication_types:
        params["publicationTypes"] = ",".join(publication_types)
    if open_access:
        # Sent with an empty value; presumably the mere presence of the
        # key enables the filter -- confirm against the API reference.
        params["openAccessPdf"] = ""
    if min_citation_count:
        params["minCitationCount"] = str(min_citation_count)
    if date_range:
        params["publicationDateOrYear"] = date_range
    if year_range:
        params["year"] = year_range
    if fields_of_study:
        params["fieldsOfStudy"] = ",".join(fields_of_study)

    # Send the API request
    try:
        # Without a timeout, requests may wait on an unresponsive server
        # indefinitely; a timeout surfaces as a RequestException below.
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()  # Raise an exception for HTTP errors
        return response.json().get("data", [])
    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error: {e}")
        # Read the body from the exception itself so the handler does not
        # depend on the local `response` having been bound.
        if e.response is not None:
            print(f"Response text: {e.response.text}")
        return None
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return None
def main(
    query: Optional[str] = None,
    limit: int = 10,
    fields: Optional[List[str]] = None,
    publication_types: Optional[
        List[
            Literal[
                "Review",
                "JournalArticle",
                "CaseReport",
                "ClinicalTrial",
                "Conference",
                "Dataset",
                "Editorial",
                "LettersAndComments",
                "MetaAnalysis",
                "News",
                "Study",
                "Book",
                "BookSection",
            ]
        ]
    ] = None,
    open_access: bool = False,
    min_citation_count: Optional[int] = None,
    date_range: Optional[str] = None,
    year_range: Optional[str] = None,
    fields_of_study: Optional[
        List[
            Literal[
                "Computer Science",
                "Medicine",
                "Chemistry",
                "Biology",
                "Materials Science",
                "Physics",
                "Geology",
                "Psychology",
                "Art",
                "History",
                "Geography",
                "Sociology",
                "Business",
                "Political Science",
                "Economics",
                "Philosophy",
                "Mathematics",
                "Engineering",
                "Environmental Science",
                "Agricultural and Food Sciences",
                "Education",
                "Law",
                "Linguistics",
            ]
        ]
    ] = None,
):
    """
    Run a Semantic Scholar search and print every returned paper.

    All arguments are forwarded unchanged to search_semantic_scholar.
    Note that publication_types defaults to None here, so no
    publication-type filter is applied unless one is passed explicitly.

    Parameters:
    -----------
    query : str, optional
        The search query term
    limit : int
        Number of results to request
    fields : List[str], optional
        List of fields to include in the response
    publication_types : List[str], optional
        Filter by publication types
    open_access : bool
        Only include papers with open access PDFs
    min_citation_count : int, optional
        Minimum number of citations
    date_range : str, optional
        Date range in format "YYYY-MM-DD:YYYY-MM-DD"
    year_range : str, optional
        Year range in format "YYYY-YYYY" or "YYYY-" or "-YYYY"
    fields_of_study : List[str], optional
        List of fields of study to filter by

    Returns:
    --------
    None
        Results are written to stdout only.
    """
    # Search for papers
    papers = search_semantic_scholar(
        query=query,
        limit=limit,
        fields=fields,
        publication_types=publication_types,
        open_access=open_access,
        min_citation_count=min_citation_count,
        date_range=date_range,
        year_range=year_range,
        fields_of_study=fields_of_study,
    )

    # An empty list and None (request failure) are reported the same way.
    if not papers:
        print("No results found or an error occurred.")
        return

    # Print results
    print_green(f"\nFound {len(papers)} papers matching your query: '{query}'")
    for paper in papers:
        print(paper)
    # Return normally instead of calling the site-provided exit(): that
    # helper terminates the whole interpreter, which would cut the listing
    # short and kill any program that imports and calls main().
def search_paper_by_title(
    title: str,
    fields: Optional[List[str]] = None
):
    """
    Search for a single paper that best matches the given title.

    Parameters:
    -----------
    title : str
        The title to search for
    fields : List[str], optional
        List of fields to include in the response. When None, a default
        set (title, abstract, year, authors.name, externalIds, url,
        publicationDate, journal, citationCount, openAccessPdf) is
        requested.

    Returns:
    --------
    dict or None
        The decoded JSON body of the response, returned as-is, or None if
        no match was found (HTTP 404) or the request failed. Failures are
        printed to stdout rather than raised.
        NOTE(review): the match endpoint appears to wrap the paper in a
        "data" list rather than returning the paper object directly --
        confirm against the API reference before indexing the result.
    """
    # Define the API endpoint URL
    url = "https://api.semanticscholar.org/graph/v1/paper/search/match"

    # Set up default fields if not provided
    if fields is None:
        fields = [
            "title",
            "abstract",
            "year",
            "authors.name",
            "externalIds",
            "url",
            "publicationDate",
            "journal",
            "citationCount",
            "openAccessPdf",
        ]

    # Build query parameters
    params = {"query": title, "fields": ",".join(fields)}

    # Send the API request
    try:
        # Without a timeout, requests may wait on an unresponsive server
        # indefinitely; a timeout surfaces as a RequestException below.
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()  # Raise an exception for HTTP errors
        return response.json()
    except requests.exceptions.HTTPError as e:
        # This handler treats a 404 as "no matching paper" and reports it
        # without the raw response body.
        if e.response.status_code == 404:
            print(f"No paper found matching title: {title}")
            return None
        else:
            print(f"HTTP Error: {e}")
            print(f"Response text: {e.response.text}")
            return None
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return None
def get_paper_details(
    paper_id: str,
    fields: Optional[List[str]] = None
):
    """
    Get detailed information about a paper by its identifier.

    Parameters:
    -----------
    paper_id : str
        The paper identifier. Can be:
        - Semantic Scholar ID (e.g., 649def34f8be52c8b66281af98ae884c09aef38b)
        - DOI (e.g., DOI:10.18653/v1/N18-3011)
        - arXiv ID (e.g., ARXIV:2106.15928)
        - etc.
        A bare DOI (a value starting with "10.") gets the "DOI:" prefix
        added automatically before the request is sent.
    fields : List[str], optional
        List of fields to include in the response. When None, a default
        set (title, abstract, year, authors.name, externalIds, url,
        publicationDate, journal, citationCount, openAccessPdf) is
        requested.

    Returns:
    --------
    dict or None
        JSON data for the paper, or None if not found or error. Failures
        are printed to stdout rather than raised.
    """
    # Add DOI: prefix if it's a DOI without the prefix. This must happen
    # before the URL is built, otherwise the normalised identifier never
    # reaches the request.
    if paper_id.startswith("10.") and "DOI:" not in paper_id:
        paper_id = f"DOI:{paper_id}"

    # Define the API endpoint URL
    url = f"https://api.semanticscholar.org/graph/v1/paper/{paper_id}"

    # Set up default fields if not provided
    if fields is None:
        fields = [
            "title",
            "abstract",
            "year",
            "authors.name",
            "externalIds",
            "url",
            "publicationDate",
            "journal",
            "citationCount",
            "openAccessPdf",
        ]

    # Build query parameters
    params = {"fields": ",".join(fields)}

    # Send the API request
    try:
        # Without a timeout, requests may wait on an unresponsive server
        # indefinitely; a timeout surfaces as a RequestException below.
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()  # Raise an exception for HTTP errors
        return response.json()
    except requests.exceptions.HTTPError as e:
        # A 404 is reported as "not found" without the raw response body.
        if e.response.status_code == 404:
            print(f"No paper found with ID: {paper_id}")
            return None
        else:
            print(f"HTTP Error: {e}")
            print(f"Response text: {e.response.text}")
            return None
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return None
if __name__ == "__main__":
    # Demo run when executed as a script: fetch a single open-access
    # result for a fixed query and print it.
    demo_fields = [
        "title",
        "url",
        "abstract",
        "tldr",
        "externalIds",
        "year",
        "influentialCitationCount",
        "fieldsOfStudy",
        "publicationDate",
        "journal",
    ]
    main(query="machine learning", limit=1, fields=demo_fields, open_access=True)