User:John Vandenberg/google books scrape text.user.js

From Wikisource
Jump to navigation Jump to search
Note: After saving, changes may not occur immediately. Click here to learn how to bypass your browser's cache.
  • Firefox / Safari: Hold Shift while clicking Reload, or press either Ctrl-F5 or Ctrl-R (Cmd-R on a Mac)
  • Google Chrome: Press Ctrl-Shift-R (Cmd-Shift-R on a Mac)
  • Internet Explorer: Hold Ctrl while clicking Refresh, or press Ctrl-F5
  • Opera: Clear the cache in Tools → Preferences

For details and instructions about other browsers, see Wikipedia:Bypass your cache.

// Copyright (C) John Mark Vandenberg - jayvdb@gmail.com
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
// http://www.gnu.org/copyleft/gpl.html
//
// --------------------------------------------------------------------
//
// This is a Greasemonkey user script.
//
// To install, you need Greasemonkey: http://greasemonkey.mozdev.org/
// Then restart Firefox and revisit this script.
// Under Tools, there will be a new menu item to "Install User Script".
// Accept the default configuration and install.
//
// To uninstall, go to Tools/Manage User Scripts,
// select "Google Books Accessible", and click Uninstall.
//
// --------------------------------------------------------------------
//
// ==UserScript==
// @name          Google Books Scrape Text
// @description   Adds a "Copy" button on "Full View" book to copy text to clipboard.
//                The config option 'signed.applets.codebase_principal_support' must
//                be set to true; see http://www.krikkit.net/howto_javascript_copy_clipboard.html
// @include       http://books.google.com*/books?*pg=*
// ==/UserScript==

function copy_clip (meintext) {
   // Copyright (C) krikkit - krikkit@gmx.net
   // http://www.krikkit.net/howto_javascript_copy_clipboard.html
   if (window.clipboardData) 
   {
      // the IE-manier
      window.clipboardData.setData("Text", meintext);

      // waarschijnlijk niet de beste manier om Moz/NS te detecteren;
      // het is mij echter onbekend vanaf welke versie dit precies werkt:
   }
   else if (unsafeWindow.netscape) 
   { 
      // dit is belangrijk maar staat nergens duidelijk vermeld:
      // you have to sign the code to enable this, or see notes below 
      unsafeWindow.netscape.security.PrivilegeManager.enablePrivilege('UniversalXPConnect');

      // maak een interface naar het clipboard
      var clip = Components.classes['@mozilla.org/widget/clipboard;1']
                 .createInstance(Components.interfaces.nsIClipboard);
      if (!clip) return;

      // maak een transferable
      var trans = Components.classes['@mozilla.org/widget/transferable;1']
                  .createInstance(Components.interfaces.nsITransferable);

      if (!trans) return;

      // specificeer wat voor soort data we op willen halen; text in dit geval
      trans.addDataFlavor('text/unicode');

      // om de data uit de transferable te halen hebben we 2 nieuwe objecten 
      // nodig om het in op te slaan
      var str = new Object();
      var len = new Object();

      var str = Components.classes["@mozilla.org/supports-string;1"]
                   .createInstance(Components.interfaces.nsISupportsString);

      var copytext=meintext;

      str.data=copytext;

      trans.setTransferData("text/unicode",str,copytext.length*2);

      var clipid=Components.interfaces.nsIClipboard;

      if (!clip) return false;

      clip.setData(trans,null,clipid.kGlobalClipboard);
   }
   else {
      return false;
   }

   return true;
}
// ====


function getTextContainer () {
   var container_table = document.getElementById('container');

   var node = container_table.firstChild.childNodes[3].childNodes[1].firstChild;

   node.id = 'text_container';
   return node;
}

function GetAccessibleHTML () {
   var text_container = getTextContainer ();
   return text_container.innerHTML;
}

var extract_pagetext_re = /"Content":"(.*)","Flags":[0-9]*}/;
function CopyText () {
   var text;
   var page_number;
   var text_mode_a = document.getElementById('text_mode');

   if (text_mode_a) {
      var text_mode_url = text_mode_a.childNodes[0].getAttribute('href');
      GM_log('found url : ' + text_mode_url);

      if (/pg=([a-zA-Z0-9\-_]*)&/.exec (text_mode_url) ) {
         GM_log('found page num : ' + RegExp.$1);
         page_number = RegExp.$1;
      }
   }
   else {
      page_number = 'PA' + document.getElementById ('jtp').value
   }

   if (output_mode == 'html_text') {
      text = GetAccessibleHTML();
   }
   else { //if (output_mode == 'images') {
      if (!unsafeWindow['copytext_page' + page_number]) {
         var jstext_url = location.href.replace(/#.*/,'')
                                       .replace(/(pg|output)=[^&]*/,'') 
                          + '&pg=' + page_number 
                          + '&jscmd=click2&output=text';

         GM_log('fetching ' + jstext_url);

         GM_xmlhttpRequest({
            method: 'GET',
            url: jstext_url,
            onload: function(responseDetails) {
               if (responseDetails.status != 200) {
                  GM_log('google books status: ' + responseDetails.status);
                  return;
               }

               if (!extract_pagetext_re.exec(responseDetails.responseText) ) {
                  GM_log('google books page text not found: ' + responseDetails.responseText);
                  return;
               }

               var text = RegExp.$1;
	       text = text.replace(/\\"/g,'"').replace(/\\n/gm,'\n');

               unsafeWindow['copytext_page' + page_number] = text;
               CopyText();
            }
         });
	 return;
      }

      text = unsafeWindow['copytext_page' + page_number];
   }

   // format it as either text, lines (formatted) or leave it as html
   var method = document.getElementById ('copytext_method').value;
   if (method != 'html')
      text = text.replace('<br>', '\n', 'g').replace(/<[\/]*p>/g, '');

   if (method == 'text')
      text = text.replace('\n', ' ', 'g');

   if ( copy_clip ( text ) ) {
      var button;
      if (this && this.type && this.type == 'BUTTON')
          button = this
      else
          button = document.getElementById ('copytext');

      button.innerHTML = "Copied";
      button.disabled = true;
      window.setTimeout(RestoreCopyTextButton, 2000);
   }
   return false;
}

function RestoreCopyTextButton() {
   var button = document.getElementById('copytext');

   if (!button) return;
   button.disabled = false;
}

function ResetCopyTextButton() {
   var button = document.getElementById('copytext');

   if (!button) return;

   button.innerHTML = 'Copy';
   button.disabled = false;
}

function AddCopyAction () {
   var pagecontrol = document.getElementById('pagecontrol');
   var jtp_form = document.getElementById('jtp_form');
   var arrows = pagecontrol.getElementsByTagName('A');

   if (!jtp_form) return; // probably not full view

   var toolbar = document.getElementById('toolbar');
   toolbar.parentNode.setAttribute('width', '20%');
   toolbar.parentNode.previousSibling.setAttribute('width', '25%');

   var regularview = document.getElementById('regular_view');
   // shorten from "Exit from fullscreen" to "Exit"
   if (regularview.firstChild)
      regularview.firstChild.lastChild.firstChild.nodeValue = 'Exit';

   var dropzone = regularview.parentNode.parentNode; // = TR

   var copy_td = document.createElement('TD');
   copy_td.id = 'copy_td';

   // add new TD before the 'regular view' TD
   // that is hidden until in fullscreen mode
   dropzone.insertBefore( copy_td, regularview.parentNode );

   var node = document.createElement('BUTTON');
   node.id = 'copytext';
   node.innerHTML = 'Copy';
   node.addEventListener('click', CopyText, 'false');

   copy_td.appendChild ( node );

   var list = document.createElement('SELECT');
   list.id = 'copytext_method';
   list.innerHTML = '<OPTION value="html">HTML</OPTION>'
                  + '<OPTION value="formatted" SELECTED>Lines</OPTION>'
                  + '<OPTION value="text">Text</OPTION>';

   copy_td.appendChild ( list );

   // when the user changes pages, the Copy button needs to be reset
   jtp_form.addEventListener('submit', ResetCopyTextButton, 'false');

   arrows[0].addEventListener('click', ResetCopyTextButton, 'false');
   arrows[1].addEventListener('click', ResetCopyTextButton, 'false');
}

var output_param = /output=([^&]*)/;

var output_mode = 'images';
if (output_param.exec ( location.href ) )
   output_mode = RegExp.$1;

AddCopyAction ();

if (GM_registerMenuCommand) {
   GM_registerMenuCommand('Copy text to clipboard', CopyText);
}