299 lines
8.5 KiB
JavaScript
299 lines
8.5 KiB
JavaScript
|
{
|
||
|
"translatorID": "56854750-868a-4de0-bfe5-fe075344a121",
|
||
|
"label": "PRC History Review",
|
||
|
"creator": "Bo An",
|
||
|
"target": "^https?://(www\\.)?prchistory\\.org/",
|
||
|
"minVersion": "3.0",
|
||
|
"maxVersion": "",
|
||
|
"priority": 100,
|
||
|
"inRepository": true,
|
||
|
"translatorType": 4,
|
||
|
"browserSupport": "gcsibv",
|
||
|
"lastUpdated": "2021-12-30 18:33:59"
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
***** BEGIN LICENSE BLOCK *****
|
||
|
|
||
|
Copyright © 2021 Bo An
|
||
|
|
||
|
This file is part of Zotero.
|
||
|
|
||
|
Zotero is free software: you can redistribute it and/or modify
|
||
|
it under the terms of the GNU Affero General Public License as published by
|
||
|
the Free Software Foundation, either version 3 of the License, or
|
||
|
(at your option) any later version.
|
||
|
|
||
|
Zotero is distributed in the hope that it will be useful,
|
||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
|
GNU Affero General Public License for more details.
|
||
|
|
||
|
You should have received a copy of the GNU Affero General Public License
|
||
|
along with Zotero. If not, see <http://www.gnu.org/licenses/>.
|
||
|
|
||
|
***** END LICENSE BLOCK *****
|
||
|
*/
|
||
|
|
||
|
function detectWeb(doc, url) {
|
||
|
const articles = getArticles(doc, url);
|
||
|
if (!articles) {
|
||
|
return false;
|
||
|
}
|
||
|
else {
|
||
|
return 'multiple';
|
||
|
}
|
||
|
}
|
||
|
|
||
|
function doWeb(doc, url) {
|
||
|
const articles = getArticles(doc, url);
|
||
|
if (articles) {
|
||
|
Zotero.selectItems(articles, (items) => {
|
||
|
if (!items) {
|
||
|
return true;
|
||
|
}
|
||
|
for (const i in items) {
|
||
|
let isIssue = false;
|
||
|
let isSingleArticle = false;
|
||
|
let isHomePage = isHomePageUrl(url);
|
||
|
// if it's the homepage, treat links as individual journal articles without a dedicated child page, otherwise as issue articles.
|
||
|
if (!isHomePage) {
|
||
|
// if it's not homepage, further check if it is single article issue, or multiple article issue.
|
||
|
isSingleArticle = Object.keys(articles).length === 1;
|
||
|
// when there is only one article on a child page. This usually is the case with a single research paper with its own page. Note this is NOT how many items are selected by the user to add to Zotero.
|
||
|
isIssue = !isSingleArticle;
|
||
|
}
|
||
|
// pass issue information to scraper
|
||
|
scrape(doc, i, isIssue, isSingleArticle);
|
||
|
}
|
||
|
return true;
|
||
|
});
|
||
|
}
|
||
|
}
|
||
|
|
||
|
function getArticles(doc, url) {
|
||
|
let items = {};
|
||
|
let found = false;
|
||
|
|
||
|
// since PRC history review provides downlodable links on both the home page and its children issue pages, we need to:
|
||
|
if (isHomePageUrl(url)) {
|
||
|
const pdfLinkEls = doc.querySelectorAll('a');
|
||
|
pdfLinkEls.forEach((link) => {
|
||
|
const href = link.href;
|
||
|
const hasPDF = isPdfUrl(href);
|
||
|
if (hasPDF) {
|
||
|
items[href] = link.textContent;
|
||
|
if (found === false) {
|
||
|
found = true;
|
||
|
}
|
||
|
}
|
||
|
});
|
||
|
}
|
||
|
else {
|
||
|
const bookLinkEls = doc.querySelectorAll('p, h5');
|
||
|
bookLinkEls.forEach((bookLinkEl) => {
|
||
|
const link = bookLinkEl.querySelectorAll('a')[0];
|
||
|
if (link) {
|
||
|
const href = link.href;
|
||
|
const hasPDF = isPdfUrl(href);
|
||
|
if (hasPDF) {
|
||
|
const title = link.textContent;
|
||
|
|
||
|
if (!title.toLowerCase().includes('here') && title.length > 6) {
|
||
|
items[href] = title;
|
||
|
if (found === false) {
|
||
|
found = true;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
});
|
||
|
}
|
||
|
return found ? items : false;
|
||
|
}
|
||
|
|
||
|
|
||
|
// since PRC history review often directly link to pdf, the url here, unlike press journal articles, is often an PDF link.
|
||
|
function scrape(doc, url, isIssue, isSingleArticle) {
|
||
|
let articleLinkEls;
|
||
|
|
||
|
if (isIssue) {
|
||
|
articleLinkEls = doc.querySelectorAll('p, h5');
|
||
|
}
|
||
|
else {
|
||
|
articleLinkEls = doc.querySelectorAll('h5, h6');
|
||
|
}
|
||
|
|
||
|
// in order to decouple scraping from detection, the same process of finding a journal article link is repeated to access the text info associated with that link. That way, detecting functions do not need to provide any other info than the url.
|
||
|
articleLinkEls.forEach((articleDivEl, index) => {
|
||
|
const linkEl = articleDivEl.querySelectorAll("a")[0];
|
||
|
if (!linkEl) {
|
||
|
return;
|
||
|
}
|
||
|
const href = linkEl.href;
|
||
|
if (href === url) {
|
||
|
const hasPDF = isPdfUrl(href);
|
||
|
if (hasPDF) {
|
||
|
const newItem = new Zotero.Item('journalArticle');
|
||
|
|
||
|
newItem.publicationTitle = "The PRC History Review";
|
||
|
|
||
|
// add title
|
||
|
if (isIssue) {
|
||
|
newItem.title = linkEl.textContent;
|
||
|
}
|
||
|
else if (isSingleArticle) {
|
||
|
const rawTitle = linkEl.textContent;
|
||
|
const rawTitleParts = rawTitle.split(',');
|
||
|
rawTitleParts.shift();
|
||
|
newItem.title = rawTitleParts.join(",").replace(/[”“]/g, '');
|
||
|
}
|
||
|
else {
|
||
|
// if it is homepage link, parse the one-link raw title into title and other info.
|
||
|
const rawTitle = linkEl.textContent;
|
||
|
const rawTitleParts = rawTitle.split(':');
|
||
|
const firstPart = rawTitleParts[0];
|
||
|
|
||
|
const hasSeriesInfo = firstPart && firstPart.toLowerCase().endsWith('series');
|
||
|
if (hasSeriesInfo) {
|
||
|
newItem.seriesTitle = firstPart;
|
||
|
// remove series title, leaving the rest as title;
|
||
|
rawTitleParts.shift();
|
||
|
newItem.title = rawTitleParts.join(":");
|
||
|
}
|
||
|
else {
|
||
|
newItem.title = rawTitle;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// add issue info
|
||
|
if (isIssue) {
|
||
|
const issueInfoEl = doc.querySelectorAll('h5')[0];
|
||
|
if (issueInfoEl) {
|
||
|
const issueInfoArray = issueInfoEl.textContent.split('★');
|
||
|
if (issueInfoArray.length === 3) {
|
||
|
const volume = issueInfoArray[0];
|
||
|
newItem.volume = volume.toLowerCase().replace('volume', '').trim();
|
||
|
const issue = issueInfoArray[1];
|
||
|
newItem.issue = issue.toLowerCase().replace('number', '').trim();
|
||
|
const date = issueInfoArray[2];
|
||
|
newItem.date = ZU.strToISO(date);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
else if (isSingleArticle) {
|
||
|
// if it's single article page, the issue info is two divs above the link div.
|
||
|
const issueInfoElIndex = index - 2;
|
||
|
if (issueInfoElIndex >= 0) {
|
||
|
const issueInfoEl = doc.querySelectorAll('h5')[issueInfoElIndex];
|
||
|
if (issueInfoEl) {
|
||
|
const issueInfoArray = issueInfoEl.textContent.split('★');
|
||
|
if (issueInfoArray.length === 3) {
|
||
|
const volume = issueInfoArray[0];
|
||
|
newItem.volume = volume.toLowerCase().replace('volume', '').trim();
|
||
|
const issue = issueInfoArray[1];
|
||
|
newItem.issue = issue.toLowerCase().replace('number', '').trim();
|
||
|
const date = issueInfoArray[2];
|
||
|
newItem.date = ZU.strToISO(date);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
else {
|
||
|
// if it's homepage article, the issue info is in the previous div.
|
||
|
const lastIndex = index - 1;
|
||
|
if (lastIndex >= 0) {
|
||
|
const issueInfoEl = articleLinkEls[lastIndex];
|
||
|
const issueInfoArray = issueInfoEl.textContent.split('★');
|
||
|
if (issueInfoArray.length === 2) {
|
||
|
const issue = issueInfoArray[0];
|
||
|
newItem.issue = issue.toLowerCase().replace('number', '').trim();
|
||
|
const date = issueInfoArray[1];
|
||
|
newItem.date = ZU.strToISO(date);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// add author
|
||
|
// for issue authors
|
||
|
if (isIssue) {
|
||
|
const authorEl = articleLinkEls[index + 1];
|
||
|
if (authorEl) {
|
||
|
// in case there are multiple authors
|
||
|
const authorTexts = authorEl.textContent.trim().split(' and ');
|
||
|
authorTexts.forEach((authorText) => {
|
||
|
const authorName = authorText.split(',')[0];
|
||
|
if (authorName) {
|
||
|
newItem.creators.push(ZU.cleanAuthor(authorName, 'author', false));
|
||
|
}
|
||
|
});
|
||
|
}
|
||
|
}
|
||
|
// for single article issue's author information
|
||
|
if (isSingleArticle) {
|
||
|
const rawAuthorText = linkEl.textContent.split(',')[0];
|
||
|
if (rawAuthorText) {
|
||
|
const authorTexts = rawAuthorText.trim().split(' and ');
|
||
|
authorTexts.forEach((authorText) => {
|
||
|
const authorName = authorText.split(',')[0];
|
||
|
if (authorName) {
|
||
|
newItem.creators.push(ZU.cleanAuthor(authorName, 'author', false));
|
||
|
}
|
||
|
});
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Download pdf
|
||
|
const pdfUrl = url;
|
||
|
if (pdfUrl && isPdfUrl) {
|
||
|
newItem.attachments.push({
|
||
|
url: pdfUrl,
|
||
|
mimeType: "application/pdf",
|
||
|
});
|
||
|
}
|
||
|
newItem.complete();
|
||
|
}
|
||
|
}
|
||
|
});
|
||
|
}
|
||
|
|
||
|
// helper functions.
|
||
|
function isPdfUrl(url) {
|
||
|
return url.toLowerCase().endsWith('.pdf');
|
||
|
}
|
||
|
|
||
|
function isHomePageUrl(url) {
|
||
|
return url.endsWith('the-prc-history-review/');
|
||
|
}
|
||
|
|
||
|
|
||
|
/** BEGIN TEST CASES **/
|
||
|
var testCases = [
|
||
|
{
|
||
|
"type": "web",
|
||
|
"url": "http://prchistory.org/review-october-2021/",
|
||
|
"items": "multiple"
|
||
|
},
|
||
|
{
|
||
|
"type": "web",
|
||
|
"url": "http://prchistory.org/the-prc-history-review-5-2/",
|
||
|
"items": "multiple"
|
||
|
},
|
||
|
{
|
||
|
"type": "web",
|
||
|
"url": "http://prchistory.org/review-october-2017/",
|
||
|
"items": "multiple"
|
||
|
},
|
||
|
{
|
||
|
"type": "web",
|
||
|
"url": "http://prchistory.org/review-april-2017/",
|
||
|
"items": "multiple"
|
||
|
},
|
||
|
{
|
||
|
"type": "web",
|
||
|
"url": "http://prchistory.org/issue_6_3/",
|
||
|
"items": "multiple"
|
||
|
}
|
||
|
]
|
||
|
/** END TEST CASES **/
|