问题 #
希望能批量下载自己的某一篇文章被哪些文章所引用,并获取这些施引文章的基本信息,包括发表日期、发表期刊等。
方法 #
使用python和R方案进行解决
首先是python脚本抓取信息
#!/usr/bin/env python
import scholarly
from scholarly import ProxyGenerator
from scholarly import scholarly
# pickle提供了一个简单的持久化功能。可以将对象以文件的形式存放在磁盘上
# python中几乎所有的数据类型(列表,字典,集合,类等)都可以用pickle来序列化
import pickle
# Set up a ProxyGenerator backed by ScraperAPI.
# This needs to be done only once per session, and it requires registering
# with ScraperAPI to obtain an API key.
pg = ProxyGenerator()
success = pg.ScraperAPI("YOUR API")
if not success:
    # Stop early instead of continuing with a proxy that was never set up.
    raise RuntimeError("Could not set up the ScraperAPI proxy; check the API key.")

# Now search Google Scholar from behind a proxy
scholarly.use_proxy(pg)
search_query = scholarly.search_author('AUTHOR NAME')
# Only the first author returned by the search is used.
first_author_result = next(search_query, None)
if first_author_result is None:
    raise LookupError("The author search returned no results.")
author = scholarly.fill(first_author_result)

# 0-based position of the publication of interest in the author's
# publication list. NOTE: 1 selects the SECOND entry; use 0 for the first.
publication_index = 1
first_publication = author['publications'][publication_index]
first_publication_filled = scholarly.fill(first_publication)

# Which papers cited that publication?
citations_all = list(scholarly.citedby(first_publication_filled))

# Persist the raw citation records so that the post-processing step can be
# re-run without scraping again. The context manager closes the file even
# if pickling fails.
with open("dictionary_data.pkl", "wb") as file:
    pickle.dump(citations_all, file)
然后是python脚本整理信息
import pickle
import pandas as pd
# Placeholder written to the CSV for any field a citation record lacks.
MISSING = 'NA'

# CSV header; the order matches the rows built by citation_to_row().
columns = ['title', 'author', 'venue', 'pub_year', 'author_id', 'citedby_url', 'pub_url', 'eprint_url', 'num_citations']


def _join_items(value):
    """Return *value* as a single ', '-separated string.

    A missing value (None) becomes the 'NA' placeholder. A plain string is
    returned unchanged, because joining it would split it into characters.
    """
    if value is None:
        return MISSING
    if isinstance(value, str):
        return value
    return ', '.join(value)


def citation_to_row(citation):
    """Flatten one citation record into a list ordered like ``columns``.

    Args:
        citation: A dict with an optional nested ``'bib'`` dict (title,
            author, venue, pub_year) and optional top-level keys
            ``author_id``, ``citedby_url``, ``pub_url``, ``eprint_url``
            and ``num_citations``.

    Returns:
        A list of nine values; every absent field is replaced by 'NA'.
    """
    bib = citation.get('bib', {})
    return [
        bib.get('title', MISSING),
        _join_items(bib.get('author')),
        bib.get('venue', MISSING),
        bib.get('pub_year', MISSING),
        _join_items(citation.get('author_id')),
        citation.get('citedby_url', MISSING),
        citation.get('pub_url', MISSING),
        citation.get('eprint_url', MISSING),
        citation.get('num_citations', MISSING),
    ]


if __name__ == "__main__":
    # NOTE: unpickling can execute arbitrary code. Only load a
    # dictionary_data.pkl file that you produced yourself.
    with open("dictionary_data.pkl", "rb") as file:
        citations_all = pickle.load(file)

    # Extract the required fields from every citation record
    data = [citation_to_row(citation) for citation in citations_all]

    # Build the DataFrame
    df = pd.DataFrame(data, columns=columns)

    # Save the DataFrame as a CSV file
    df.to_csv('citations.csv', index=False)
最后一次修改于 2023-09-03