<html>
  <head>
    <meta content="text/html; charset=utf-8" http-equiv="Content-Type">
  </head>
  <body bgcolor="#FFFFFF" text="#000000">
    corrija j1 para j1 = "<a moz-do-not-send="true"
      class="moz-txt-link-abbreviated" href="http://www.nature.com/">www.nature.com</a>"
    sem a barra "/"<br>
    <br>
    <div class="moz-cite-prefix">Em 27/05/2016 16:08, salah escreveu:<br>
    </div>
    <blockquote cite="mid:57489B45.4070001@gmail.com" type="cite">
      <meta content="text/html; charset=utf-8" http-equiv="Content-Type">
      Caro Elias<br>
      <br>
      Você pode baixar o pacote fonte metagear (a beleza do opensource)
      aqui: <br>
      <br>
      <a moz-do-not-send="true" class="moz-txt-link-freetext"
href="https://cran.r-project.org/src/contrib/Archive/metagear/metagear_0.1.tar.gz">https://cran.r-project.org/src/contrib/Archive/metagear/metagear_0.1.tar.gz</a><br>
      <br>
      Examinando a função PDF_download() notamos que ela usa quatro
      arquivos: <br>
      PDF_download.R, PDF_utils.R, PDF_collect.R e isPDF.R<br>
      <br>
      para o doi "10.1371/journal.pone.0123511" a função está extraindo
      como alvo:<br>
      <br>
      "content=\<a moz-do-not-send="true" class="moz-txt-link-rfc2396E"
href="http://journals.plos.org/plosone/article/asset?id=10.1371/journal.pone.0123511.PDF">"http://journals.plos.org/plosone/article/asset?id=10.1371/journal.pone.0123511.PDF"</a><br>
      <br>
      e o correto seria o link com atributo href:<br>
      <br>
"href=\"/plosone/article/asset?id=10.1371%2Fjournal.pone.0123511.PDF"<br>
      <br>
      resolvendo em:
      <a moz-do-not-send="true" class="moz-txt-link-rfc2396E"
href="http://journals.plos.org/plosone/article/asset?id=10.1371%2Fjournal.pone.0123511.PDF">"http://journals.plos.org/plosone/article/asset?id=10.1371%2Fjournal.pone.0123511.PDF"</a><br>
      <br>
      Segue um pequeníssimo ROBOT, veja que ele não prevê https e
      necessita das urls dos jornais alvo.<br>
      <br>
      # Author: salah<br>
###############################################################################<br>
      require(RCurl)<br>
      require(XML)<br>
      <br>
      rm(list=ls())<br>
      <br>
      ## seleciona o link alvo e converte em caracter<br>
      baixaURL = function(doi)<br>
      {<br>
          ## link doi<br>
          urlDOI = paste0(<a moz-do-not-send="true"
        class="moz-txt-link-rfc2396E" href="http://dx.doi.org/">"http://dx.doi.org/"</a>,
      doi)<br>
          <br>
          ## download url<br>
          web = getURLContent(urlDOI)<br>
          tc = textConnection(web)<br>
          web = readLines(tc)<br>
          close(tc)<br>
          <br>
          ## captura os links<br>
          lnk = getHTMLLinks(web)<br>
          <br>
          ## converte para chr<br>
          doc = htmlParse(lnk)<br>
          w = capture.output(doc)<br>
          <br>
          return(w)    <br>
      }##end baixaURL<br>
      <br>
      ## prepara a url para download<br>
      urlPDF = function(w, journal)<br>
      {<br>
          a = unlist(strsplit(w, split = "\\s"))<br>
          b = unique(grep("pdf|PDF", a, value = TRUE))<br>
          cc = grep("href+", b, value = TRUE)<br>
          d = gsub("href\\=|\"|>", "", cc)<br>
          h = gsub(".*pdf|.*PDF", "", d)<br>
          outPDF = gsub(h[1], "", d)<br>
          <br>
          return(paste0(<a moz-do-not-send="true"
        class="moz-txt-link-rfc2396E" href="http://">"http://"</a>,
      journal, outPDF))<br>
      }##end urlPDF<br>
      <br>
      ## faz o download do pdf<br>
      baixaPDF = function(urls, dest = "~")<br>
      {<br>
          j = unlist(strsplit(urls, "/"))<br>
          namePDF = grep(".pdf|.PDF", j, value = T)    <br>
          <br>
          ## se houver pdf extra<br>
          for(i in 1:length(urls))<br>
          {<br>
              dir_name = paste0(dest, namePDF[i])<br>
              download.file(urls[i], dir_name)<br>
          }##end for<br>
      }##end baixaPDF<br>
      <br>
      doiNature = c("10.1038/nutd.2016.11", "10.1038/srep17841",
      "10.1038/srep25762")<br>
      doiPLOS = c("10.1371/journal.pone.0123511",
      "10.1371/journal.pbio.1002461")<br>
      <br>
      j1 = "<a moz-do-not-send="true" class="moz-txt-link-abbreviated"
        href="http://www.nature.com/">www.nature.com/</a>"<br>
      j2 = "journals.plos.org"<br>
      <br>
      DATA = data.frame(DOI = c(doiNature, doiPLOS), Journal = c(j1, j1,
      j1, j2, j2), stringsAsFactors = FALSE)<br>
      <br>
      destDir = "~/"<br>
      <br>
      for(n in 1:nrow(DATA))<br>
      {<br>
          w = baixaURL(DATA$DOI[n])<br>
          urls = urlPDF(w, DATA$Journal[n])<br>
          print(urls)<br>
          baixaPDF(urls, destDir) ## faz o download<br>
      }<br>
      <br>
      <br>
      saudações<br>
      <br>
      <div class="moz-cite-prefix">Em 22/05/2016 23:23, Elias Carvalho
        escreveu:<br>
      </div>
      <blockquote
cite="mid:CAH8E6Ut9hxxc5CyYbkbPFnF=UTvTyfxG6-MRQkO9=8BW83EtLA@mail.gmail.com"
        type="cite">
        <div dir="ltr">Desculpe Leonardo, segue o código mínimo, tem
          dois DOIs para poder testar:
          <div><br>
          </div>
          <div><b>library(metagear)<br>
            </b></div>
          <div><b><br>
            </b></div>
          <div>
            <div><b>DOI <- "10.1371/journal.pone.0123511" #
                Disponível em: <a moz-do-not-send="true"
                  href="http://goo.gl/rhtvFx">http://goo.gl/rhtvFx</a></b></div>
            <div><b># DOI <- "10.1038/srep17841"            #
                Disponível em: <a moz-do-not-send="true"
                  href="http://goo.gl/rNwOpk">http://goo.gl/rNwOpk</a></b></div>
            <div><b>filename <- "ArtigoTeste.pdf"<br>
              </b></div>
            <div><b>folder <- "~"<br>
              </b></div>
            <div><b>PDF_download(DOI, directory = folder, theFileName =
                filename, validatePDF = TRUE, quiet = FALSE)</b><br>
            </div>
            <div> </div>
          </div>
          <div><br>
          </div>
        </div>
        <div class="gmail_extra"><br>
          <div class="gmail_quote">Em 22 de maio de 2016 09:41, Elias
            Carvalho <span dir="ltr"><<a moz-do-not-send="true"
                href="mailto:ecacarva@gmail.com" target="_blank">ecacarva@gmail.com</a>></span>
            escreveu:<br>
            <blockquote class="gmail_quote" style="margin:0 0 0
              .8ex;border-left:1px #ccc solid;padding-left:1ex">
              <div dir="ltr">Bom dia Pessoal
                <div><br>
                </div>
                <div>Estou tentando baixar alguns PDFs de
                  artigos científicos pelo DOI usando o pacote metagear,
                  porém a maioria apresenta a mensagem "failed, no valid
                  url links detected", ou</div>
                <div>"cannot open: HTTP status was '404 Not Found' PDF
                  download... skipped", mesmo ligando o proxy da
                  universidade.<br>
                </div>
                <div><br>
                </div>
                <div>De 1800 artigos que preciso, consegui apenas por volta de
                  200 downloads.</div>
                <div><br>
                </div>
                <div>Alguém poderia me dar uma ajuda em como resolver
                  este problema ?<span class="HOEnZb"><font
                      color="#888888"><br clear="all">
                      <div><br>
                      </div>
                      -- <br>
                      <div>
                        <div dir="ltr">
                          <div>
                            <div dir="ltr">
                              <div>
                                <div dir="ltr">
                                  <div dir="ltr">Best regards... 8^)<br>
                                    <br>
                                    “<span style="font-size:12.8px">The
                                      mind that is open to new ideas
                                      never come back</span>
                                    <div><span style="font-size:12.8px">to

                                        its original size</span>”  <i
style="margin:0px;padding:0px;border:0px;font-style:inherit;font-variant:inherit;font-weight:inherit;font-stretch:inherit;font-size:inherit;vertical-align:baseline;color:rgb(51,51,51);font-family:Arial,Verdana,sans-serif;line-height:21px;background-color:rgb(255,255,255)">Albert

                                        Einstein</i><span
style="font-size:13px;color:rgb(51,51,51);font-family:Arial,Verdana,sans-serif;line-height:21px;background-color:rgb(255,255,255)"> </span><br>
                                      <br>
                                      <br>
                                      <span style="font-size:small">-- </span><br
                                        style="font-size:small">
                                      <div style="font-size:small">
                                        <div dir="ltr">
                                          <div dir="ltr">
                                            <div dir="ltr">
                                              <div>Obrigado</div>
                                              <div>Elias</div>
                                            </div>
                                          </div>
                                        </div>
                                      </div>
                                    </div>
                                  </div>
                                </div>
                              </div>
                            </div>
                          </div>
                        </div>
                      </div>
                    </font></span></div>
              </div>
            </blockquote>
          </div>
          <br>
          <br clear="all">
          <div><br>
          </div>
          -- <br>
          <div class="gmail_signature">
            <div dir="ltr">
              <div>
                <div dir="ltr">
                  <div>
                    <div dir="ltr">
                      <div dir="ltr">Best regards... 8^)<br>
                        <br>
                        “<span style="font-size:12.8px">The mind that is
                          open to new ideas never come back</span>
                        <div><span style="font-size:12.8px">to its
                            original size</span>”  <i
style="margin:0px;padding:0px;border:0px;font:inherit;vertical-align:baseline;color:rgb(51,51,51);font-family:Arial,Verdana,sans-serif;line-height:21px;background-color:rgb(255,255,255)">Albert

                            Einstein</i><span
style="font-size:13px;color:rgb(51,51,51);font-family:Arial,Verdana,sans-serif;line-height:21px;background-color:rgb(255,255,255)"> </span><br>
                          <br>
                          <br>
                          <span style="font-size:small">-- </span><br
                            style="font-size:small">
                          <div style="font-size:small">
                            <div dir="ltr">
                              <div dir="ltr">
                                <div dir="ltr">
                                  <div>Obrigado</div>
                                  <div>Elias</div>
                                </div>
                              </div>
                            </div>
                          </div>
                        </div>
                      </div>
                    </div>
                  </div>
                </div>
              </div>
            </div>
          </div>
        </div>
        <br>
        <fieldset class="mimeAttachmentHeader"></fieldset>
        <br>
        <pre wrap="">_______________________________________________
R-br mailing list
<a moz-do-not-send="true" class="moz-txt-link-abbreviated" href="mailto:R-br@listas.c3sl.ufpr.br">R-br@listas.c3sl.ufpr.br</a>
<a moz-do-not-send="true" class="moz-txt-link-freetext" href="https://listas.inf.ufpr.br/cgi-bin/mailman/listinfo/r-br">https://listas.inf.ufpr.br/cgi-bin/mailman/listinfo/r-br</a>
Leia o guia de postagem (<a moz-do-not-send="true" class="moz-txt-link-freetext" href="http://www.leg.ufpr.br/r-br-guia">http://www.leg.ufpr.br/r-br-guia</a>) e forneça código mínimo reproduzível.</pre>
      </blockquote>
      <br>
    </blockquote>
    <br>
  </body>
</html>