首页
学习
活动
专区
工具
TVP
发布
社区首页 >问答首页 >使用Zotero翻译器获取作者归属

使用 Zotero 翻译器(translator)获取作者所属机构
EN

Stack Overflow用户
提问于 2015-10-22 13:32:35
回答 1查看 256关注 0票数 0

我目前正在做一个项目,需要从 dblp 上收录的文章中获取作者的所属机构(affiliation)。为此,我搭建了一个 Zotero translation-server;它可以从其 GitHub 仓库获取,并按照那里的说明进行安装。

然后,我在我的Java程序中建立了这样的连接:

代码语言:javascript
复制
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import org.json.*;


public class ZoteroHandler 
{

//Function running the scan
public static void Scan(Article article) throws Exception
{
    //Setting up an URL HttpURLConnection given DOI
    URL urlDoi = new URL (article.GetElectronicEdition());
    HttpURLConnection connDoi = (HttpURLConnection) urlDoi.openConnection();

    // Make the logic below easier to detect redirections
    connDoi.setInstanceFollowRedirects(false);  

    String doi = "{\"url\"Smiley unsure"" + connDoi.getHeaderField("Location") + "\",\"sessionid\"Smiley unsure"abc123\"}";

    //Setting up an URL to translation-server
    URL url = new URL("http://127.0.0.1:1969/web");
    URLConnection conn = url.openConnection();

    conn.setDoOutput(true);
    conn.setRequestProperty("Content-Type", "application/json");

    OutputStreamWriter writer = new OutputStreamWriter(conn.getOutputStream());

    writer.write(doi);
    writer.flush();

    String line;
    BufferedReader reader = new BufferedReader(new InputStreamReader(conn.getInputStream()));

    while ((line = reader.readLine()) != null ) 
    {
        //Used to see of we get something from stream
        System.out.println(line);

        //Incoming is JSONArray, so create new array, fill it then parse it 
        JSONArray jsonArr = new JSONArray(line);
        JSONObject obj = jsonArr.getJSONObject(0);

        //Getting abstracts
        String abstracts = obj.getString("abstractNote");
        System.out.println(abstracts);

        //Setting information in db
        article.SetAbstracts(abstracts);
        DatabaseHandler.GetInstance().UpdateArticle(article);

    }

    writer.close(); 
    reader.close(); 

    //Need to disconnect?
    //((HttpURLConnection) conn).disconnect();
    //connDoi.disconnect();
}

到目前为止一切正常:我能够获取所需的信息,将其保存在字符串 abstracts 中,并写入我们的数据库。但现在我还需要获取作者的所属机构,因此需要以某种方式修改我所使用的翻译器(translator)脚本。

这是一个脚本:

代码语言:javascript
复制
    {
    "translatorID": "5af42734-7cd5-4c69-97fc-bc406999bdba",
    "label": "Atypon Journals",
    "creator": "Sebastian Karcher",
    "target": "^https?://[^?#]+(?:/doi/((?:abs|abstract|full|figure|ref|citedby|book)/)?10\\.|/action/doSearch\\?)|^https?://[^/]+/toc/",
    "minVersion": "3.0",
    "maxVersion": "",
    "priority": 270,
    "inRepository": true,
    "translatorType": 4,
    "browserSupport": "gcsibv",
    "lastUpdated": "2015-10-15 22:24:05"
}

/*
Atypon Journals Translator
Copyright (C) 2011-2014 Sebastian Karcher

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/


/**
 * Reports which kind of item, if any, can be saved from the current page.
 *
 * Table-of-contents and search pages yield "multiple" when at least one
 * article link is found, otherwise false. Any other page is only recognised
 * when it carries a link to /action/showCitFormats; the URL then decides
 * between 'book', 'bookSection' and "journalArticle".
 *
 * @param {Document} doc  the loaded page
 * @param {String}   url  address of the page
 * @return {String|Boolean|undefined} item type, false, or undefined when nothing is detected
 */
function detectWeb(doc, url) {
    var listingPattern = /^https?:\/\/[^\/]+\/toc\/|\/action\/doSearch\?/;
    if (url.search(listingPattern) >= 0) {
        if (getSearchResults(doc, true)) {
            return "multiple";
        }
        return false;
    }

    var citationLinks = ZU.xpath(doc, '//a[contains(@href, "/action/showCitFormats")]');
    if (!(citationLinks.length > 0)) {
        // No citation export link: not a page this translator handles.
        return;
    }

    if (url.indexOf('/doi/book/') >= 0) {
        return 'book';
    }
    if (url.search(/\.ch\d+$/) >= 0) {
        return 'bookSection';
    }
    return "journalArticle";
}

/**
 * Collects the article links listed on a table-of-contents or search page.
 *
 * Entries with class "articleEntry" are tried first; only if none of them
 * yields a link are "item-details" entries tried.
 *
 * @param {Document} doc        page to inspect
 * @param {Boolean}  checkOnly  when truthy, return true as soon as one usable entry is seen
 * @param {Object}   [extras]   when given, receives extras[url] = { pdf: ... } per entry
 * @return {Object|Boolean} map of article URL to title, true (checkOnly mode),
 *                          or false when nothing was found
 */
function getSearchResults(doc, checkOnly, extras) {
    var container = doc.getElementsByName('frmSearchResults')[0]
        || doc.getElementsByName('frmAbs')[0];
    if (!container) {
        Z.debug('Atypon: multiples container not found.');
        return false;
    }

    var doiLink = 'a[contains(@href, "/doi/abs/") or contains(@href, "/doi/abstract/") or '
        + 'contains(@href, "/doi/full/") or contains(@href, "/doi/book/")]';
    var firstDoiHref = '(.//' + doiLink + ')[1]/@href';
    var articles = {};
    var found = false;

    var entries = container.getElementsByClassName('articleEntry');
    for (var n = 0; n < entries.length; n++) {
        var entry = entries[n];
        var titleNode = entry.getElementsByClassName('art_title')[0];
        if (!titleNode) continue;
        var entryTitle = ZU.trimInternal(titleNode.textContent);

        var linkRoot = entry;
        var href = ZU.xpathText(linkRoot, firstDoiHref);
        if (!href) {
            // e.g. http://pubs.rsna.org/toc/radiographics/toc/33/7 shows links in adjacent div
            linkRoot = entry.nextElementSibling;
            if (!linkRoot || linkRoot.classList.contains('articleEntry')) continue;

            href = ZU.xpathText(linkRoot, firstDoiHref);
        }
        if (!href) continue;

        if (checkOnly) return true;
        found = true;

        if (extras) {
            extras[href] = { pdf: buildPdfUrl(href, linkRoot) };
        }
        articles[href] = entryTitle;
    }

    if (found) return articles;

    Z.debug("Trying an alternate multiple format");
    var altHref = '(.//ul[contains(@class, "icon-list")]/li/' + doiLink + ')[1]/@href';
    var details = container.getElementsByClassName("item-details");
    for (var k = 0; k < details.length; k++) {
        var detail = details[k];
        var heading = ZU.xpathText(detail, './h3');
        if (!heading) continue;
        heading = ZU.trimInternal(heading);

        var detailHref = ZU.xpathText(detail, altHref);
        if (!detailHref) continue;

        if (checkOnly) return true;
        found = true;

        if (extras) {
            extras[detailHref] = { pdf: buildPdfUrl(detailHref, detail) };
        }
        articles[detailHref] = heading;
    }

    return found ? articles : false;
}

// Matches the "/doi/" path prefix together with an optional view-type segment
// (abs, abstract, full, figure, ref, citedby, book). buildPdfUrl and scrape
// replace this prefix to derive PDF and citation-export URLs.
// Keep this in line with target regexp
var replURLRegExp = /\/doi\/((?:abs|abstract|full|figure|ref|citedby|book)\/)?/;

/**
 * Derives the PDF URL for an article from its page URL.
 *
 * A PDF URL is only produced when `root` actually contains a link to one of
 * the known PDF paths. Otherwise the links that are present are written to
 * the debug log (unless `root` is a whole document) and false is returned.
 *
 * @param {String} url   article URL containing the /doi/ prefix
 * @param {Node}   root  node whose descendant links are searched
 * @return {String|Boolean} PDF URL, or false when none can be built
 */
function buildPdfUrl(url, root) {
    // The whole thing is probably going to fail anyway
    if (!replURLRegExp.test(url)) return false;

    var candidates = ['/doi/pdf/', '/doi/pdfplus/'];
    for (var p = 0; p < candidates.length; p++) {
        var pdfPath = candidates[p];
        var hits = ZU.xpath(root, './/a[contains(@href, "' + pdfPath + '")]');
        if (hits.length) {
            return url.replace(replURLRegExp, pdfPath);
        }
    }

    Z.debug('PDF link not found.');

    // nodeType 9 is DOCUMENT_NODE: skip the link dump for whole documents.
    if (root.nodeType == 9) return false;

    Z.debug('Available links:');
    var anchors = root.getElementsByTagName('a');
    if (!anchors.length) Z.debug('No links');
    for (var a = 0; a < anchors.length; a++) {
        Z.debug(anchors[a].href);
    }
    return false;
}

/**
 * Translator entry point: saves the item(s) offered by the current page.
 *
 * A single-article page is scraped directly. On a listing page the user picks
 * entries through Zotero.selectItems, and the chosen ones are fetched one
 * after another.
 *
 * @param {Document} doc  the loaded page
 * @param {String}   url  address of the page
 */
function doWeb(doc, url) {
    if (detectWeb(doc, url) != "multiple") {
        scrape(doc, url, {pdf: buildPdfUrl(url, doc)});
        return;
    }

    var extras = {};
    var candidates = getSearchResults(doc, false, extras);
    Zotero.selectItems(candidates, function (chosen) {
        if (!chosen) {
            return true;
        }

        var queue = [];
        for (var key in chosen) {
            // Strip the "?prev..." tail from the URL that gets fetched;
            // extras stay keyed by the original URL.
            var cleanUrl = key.replace(/\?prev.+/, "");
            queue.push({ url: cleanUrl, extras: extras[key] });
        }

        fetchArticles(queue);
    });
}

/**
 * Normalises a string that arrived in ALL CAPS.
 *
 * Strings that are not entirely upper-case are returned unchanged. Missing or
 * empty values (undefined, null, "") are also returned unchanged instead of
 * raising a TypeError, since an imported item may lack a title or a name part.
 *
 * @param {String}  str          text to normalise
 * @param {Boolean} [titleCase]  truthy: use ZU.capitalizeTitle(str, true);
 *                               otherwise keep the first character and
 *                               lower-case the rest
 * @return {String} the normalised text, or `str` itself when nothing needs changing
 */
function fixCase(str, titleCase) {
    // Nothing to normalise, and calling toUpperCase() on it would throw.
    if (!str) return str;

    if (str.toUpperCase() != str) return str;

    if (titleCase) {
        return ZU.capitalizeTitle(str, true);
    }

    // slice() replaces the deprecated substr(); the result is identical here.
    return str.charAt(0) + str.slice(1).toLowerCase();
}

/**
 * Scrapes the queued articles one at a time.
 *
 * The first entry is removed from `articles` and loaded. Once processing of
 * that page is done, the function calls itself for the remaining entries.
 *
 * @param {Array} articles  queue of { url, extras } objects; consumed in place
 */
function fetchArticles(articles) {
    if (articles.length) {
        var current = articles.shift();

        var onPageLoaded = function (doc, url) {
            scrape(doc, url, current.extras);
        };
        var onFinished = function () {
            if (articles.length) fetchArticles(articles);
        };

        ZU.processDocuments(current.url, onPageLoaded, onFinished);
    }
}

/**
 * Saves one article: downloads its RIS citation from the site, imports it with
 * the RIS translator and enriches the resulting item with data read from the
 * article page (tags, abstract, PDF link, snapshot).
 *
 * @param {Document} doc     the article page
 * @param {String}   url     address of the article page
 * @param {Object}   extras  object whose `pdf` property, when truthy, is attached
 *                           as the full-text PDF URL; it is dereferenced
 *                           unconditionally, so it must not be null/undefined
 */
function scrape(doc, url, extras) {
    // Work with the bare URL: drop any query string or fragment.
    url = url.replace(/[?#].*/, "");
    // Everything from "10." onward is taken as the DOI. If the URL contains no
    // such part, match() returns null and this line throws a TypeError.
    var doi = url.match(/10\.[^?#]+/)[0];
    // Swap the "/doi/<type>/" prefix for the citation-format page address.
    var citationurl = url.replace(replURLRegExp, "/action/showCitFormats?doi=");
    var abstract = doc.getElementsByClassName('abstractSection')[0];
    //var authorAffiliation = doc.getElementsByClassName('listGroup')[0];
    // Keyword links inside the full-text paragraph become the item's tags.
    var tags = ZU.xpath(doc, '//p[@class="fulltext"]//a[contains(@href, "keyword") or contains(@href, "Keyword=")]');
    Z.debug("Citation URL: " + citationurl);
    ZU.processDocuments(citationurl, function(citationDoc){
        // The citation form carries the file name that the download request expects.
        var filename = citationDoc.evaluate('//form//input[@name="downloadFileName"]', citationDoc, null, XPathResult.ANY_TYPE, null).iterateNext().value;
        Z.debug("Filename: " + filename);
        var get = '/action/downloadCitation';
        // Form body requesting the citation in RIS format.
        var post = 'doi=' + doi + '&downloadFileName=' + filename + '&format=ris&direct=true&include=cit';

        ZU.doPost(get, post, function (text) 
        {
            //Z.debug(text);
            var translator = Zotero.loadTranslator("import");

            // Calling the RIS translator
            translator.setTranslator("32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7");
            translator.setString(text);
            // Post-process every imported item before it is saved.
            translator.setHandler("itemDone", function (obj, item) 
            {
                // Sometimes we get titles and authors in all caps
                item.title = fixCase(item.title);

                for (var i=0; i<item.creators.length; i++) 
                {
                    item.creators[i].lastName = fixCase(item.creators[i].lastName, true);
                    if (item.creators[i].firstName) {
                        item.creators[i].firstName = fixCase(item.creators[i].firstName, true);
                }
            }

                item.url = url;
                //for Emerald, get rid of the "null" that they add at the end of every title:
                if (url.indexOf("www.emeraldinsight.com")!=-1){
                    item.title = item.title.replace(/null$/, "")
                }
                // Discard any notes that came with the RIS import.
                item.notes = [];

                for (var i in tags)
                {
                    item.tags.push(tags[i].textContent)
                }

                if (abstract) 
                {
                    // Drop "Abstract" prefix
                    // This is not excellent, since some abstracts could
                    // conceivably begin with the word "abstract"
                    item.abstractNote = abstract.textContent
                        .replace(/^\s*abstract\s*/i, '');
                }

                // Replace imported attachments with the PDF (if known) and a page snapshot.
                item.attachments = [];
                if (extras.pdf) {
                    item.attachments.push({
                        url: extras.pdf,
                        title: "Full Text PDF",
                        mimeType: "application/pdf"
                    });
                }

                item.attachments.push({
                    document: doc,
                    title: "Snapshot",
                    mimeType: "text/html"
                });
                // Catalog name is the host name (without scheme and "www.") plus " (Atypon)".
                item.libraryCatalog = url.replace(/^https?:\/\/(?:www\.)?/, '')
                    .replace(/[\/:].*/, '') + " (Atypon)";
                item.complete();
            });
            translator.translate();
        });
    })
}

那么,有没有人能告诉我应该如何修改这个脚本,才能获取作者的所属机构?我知道脚本应该到 HTML 中 class 为 "listGroup" 的元素里查找作者(author)的所属机构。

如果您需要更多信息,这里有指向所有可用 Zotero 翻译器(translator)的链接,以及关于 Zotero 本身的链接。

EN

回答 1

Stack Overflow用户

回答已采纳

发布于 2015-10-24 05:43:12

我通过这样做解决了问题:

代码语言:javascript
复制
/**
 * Saves one article: downloads its RIS citation from the site, imports it with
 * the RIS translator and enriches the resulting item with data read from the
 * article page (tags, abstract, author affiliations, PDF link, snapshot).
 *
 * Zotero items have no dedicated affiliation field, so the affiliations are
 * joined with "; " and stored in item.extra.
 *
 * @param {Document} doc     the article page
 * @param {String}   url     address of the article page
 * @param {Object}   extras  object whose `pdf` property, when truthy, is attached
 *                           as the full-text PDF URL
 */
function scrape(doc, url, extras) {
    url = url.replace(/[?#].*/, "");
    var doi = url.match(/10\.[^?#]+/)[0];
    var citationurl = url.replace(replURLRegExp, "/action/showCitFormats?doi=");

    // Author affiliations: `affiliation` is the live collection of page
    // elements with class "listGroup", and `affiliations` collects their text.
    // An HTMLCollection has no push()/join(), so the text has to be copied
    // into a real array first.
    var affiliations = [];
    var affiliation = doc.getElementsByClassName('listGroup');
    for (var a = 0; a < affiliation.length; a++) {
        var affiliationText = ZU.trimInternal(affiliation[a].textContent);
        if (affiliationText) affiliations.push(affiliationText);
    }

    var abstract = doc.getElementsByClassName('abstractSection')[0];

    var tags = ZU.xpath(doc, '//p[@class="fulltext"]//a[contains(@href, "keyword") or contains(@href, "Keyword=")]');

    Z.debug("Citation URL: " + citationurl);

    ZU.processDocuments(citationurl, function(citationDoc){
        // The citation form carries the file name that the download request expects.
        var filename = citationDoc.evaluate('//form//input[@name="downloadFileName"]', citationDoc, null, XPathResult.ANY_TYPE, null).iterateNext().value;
        Z.debug("Filename: " + filename);
        var get = '/action/downloadCitation';
        var post = 'doi=' + doi + '&downloadFileName=' + filename + '&format=ris&direct=true&include=cit';

        ZU.doPost(get, post, function (text) {
            //Z.debug(text);
            var translator = Zotero.loadTranslator("import");

            // Calling the RIS translator
            translator.setTranslator("32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7");
            translator.setString(text);
            translator.setHandler("itemDone", function (obj, item) {

                // Sometimes we get titles and authors in all caps
                item.title = fixCase(item.title);

                for (var i=0; i<item.creators.length; i++) {
                    item.creators[i].lastName = fixCase(item.creators[i].lastName, true);

                    if (item.creators[i].firstName) {
                        item.creators[i].firstName = fixCase(item.creators[i].firstName, true);
                    }
                }

                item.url = url;
                //for Emerald, get rid of the "null" that they add at the end of every title:
                if (url.indexOf("www.emeraldinsight.com")!=-1){
                    item.title = item.title.replace(/null$/, "")
                }
                item.notes = [];
                for (var i in tags){
                    item.tags.push(tags[i].textContent)
                }

                if (abstract) {
                    // Drop "Abstract" prefix
                    // This is not excellent, since some abstracts could
                    // conceivably begin with the word "abstract"
                    item.abstractNote = abstract.textContent
                        .replace(/^\s*abstract\s*/i, '');
                }

                item.attachments = [];
                if (extras.pdf) {
                    item.attachments.push({
                        url: extras.pdf,
                        title: "Full Text PDF",
                        mimeType: "application/pdf"
                    });
                }

                item.attachments.push({
                    document: doc,
                    title: "Snapshot",
                    mimeType: "text/html"
                });
                item.libraryCatalog = url.replace(/^https?:\/\/(?:www\.)?/, '')
                    .replace(/[\/:].*/, '') + " (Atypon)";

                // Affiliations: only written when some were found, so that an
                // "extra" value from the RIS import is not overwritten with "".
                if (affiliations.length) {
                    item.extra = affiliations.join("; ");
                }

                item.complete();
            });
            translator.translate();
        });
    })
}

我创建了一个名为 affiliations 的数组和一个名为 affiliation 的变量。然后把得到的字符串填入数组,并将其存入 Zotero 中名为 extra 的字段,因为 Zotero 没有专门的作者所属机构字段。所以这只是一个变通办法(hack),这样我就能为我的项目获取作者的所属机构了。

票数 0
EN
页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://stackoverflow.com/questions/33282289

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档