User:Ineuw/Sandbox1

From Wikisource
Jump to: navigation, search
 

typoscan.js documentation and scripts

 


Introduction[edit]

  • The typoscan.js script highlights out-of-place characters and other anomalies introduced into the text by the OCR process; it is not a spell-checking script. Typos are highlighted on pages of the Main namespace, the Page namespace, and in Page Preview, but not in edit mode.
  • There are two versions, one of which excludes highlighting text in the Main namespace.

Examples[edit]

  • The following are considered out-of-place characters:

 Mix of letters with numbers and vice versa: 18G7 or 6eorge
 Mix of upper and lower characters: HoUse, or hOuse
 Words like "modem" instead of "modern".
 Punctuation preceded by space, excluding ellipsis: comma , and period . . . . . .
 Floating (unattached) punctuation and symbols, excluding emdash, or endash: " ' ( ) +, ^
 Opening quote " w" at the start of the word.
 Scan error of the N character.
 St or Ave when followed by space or a comma.

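Some highlighted segments must be examined in context, because the same pattern can flag both a genuine typo and a correctly spelled word:
 Incorrect: whicli (should be "which")
 Correct: Pompeii (flagged only because it contains "ii")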

Installation[edit]

  • Create a typoscan.js submodule of the common.js module named User:username/common.js/typoscan.js. Replace "username" with your username.
 

Inclusion of the Main namespace in highlighting typos[edit]

  • Currently, there are two versions:
  • For the inclusion of the Main namespace pages to be highlighted, copy this code into the submodule:
// 

// main namespace included version
 
var typoFuse;  // remaining highlight budget; overridden by actions.limit

// Returns the sorted, merged [start, end) character ranges of `text` that match
// any of `patterns`. Each pattern that matches at least once costs one unit of
// typoFuse; scanning stops as soon as the budget is exhausted.
function TypoRangesIn( text, patterns ){
  var ranges = [];

  for( var p = 0; p < patterns.length && typoFuse > 0; p++ ){
    var pattern = patterns[ p ];
    var found = false;
    var match;

    pattern.lastIndex = 0;  // global regexes keep state between exec() calls
    while( ( match = pattern.exec( text ) ) !== null ){
      if( match[ 0 ].length === 0 ){
        // an empty match would never advance on its own
        if( !pattern.global ){ break; }
        pattern.lastIndex++;
        continue;
      }
      ranges.push( [ match.index, match.index + match[ 0 ].length ] );
      found = true;
      if( !pattern.global ){ break; }  // without /g exec() keeps returning the first match
    }
    pattern.lastIndex = 0;

    if( found ){
      typoFuse--;
    }
  }

  ranges.sort( function( a, b ){ return a[ 0 ] - b[ 0 ] || a[ 1 ] - b[ 1 ]; } );

  // Merge overlapping/adjacent ranges so a stretch of text is wrapped only once.
  var merged = [];
  for( var r = 0; r < ranges.length; r++ ){
    var last = merged[ merged.length - 1 ];
    if( last && ranges[ r ][ 0 ] <= last[ 1 ] ){
      last[ 1 ] = Math.max( last[ 1 ], ranges[ r ][ 1 ] );
    } else {
      merged.push( ranges[ r ] );
    }
  }
  return merged;
}

// Replaces `textNode` with a fragment in which every range of `ranges` is
// wrapped in <span style="styling">; the text between ranges stays plain text.
function WrapTypoRanges( textNode, ranges, styling ){
  var doc = textNode.ownerDocument || document;
  var text = textNode.nodeValue;
  var fragment = doc.createDocumentFragment();
  var cursor = 0;

  for( var r = 0; r < ranges.length; r++ ){
    var start = ranges[ r ][ 0 ];
    var end = ranges[ r ][ 1 ];

    if( start > cursor ){
      fragment.appendChild( doc.createTextNode( text.slice( cursor, start ) ) );
    }
    var span = doc.createElement( 'span' );
    span.setAttribute( 'style', styling );
    span.appendChild( doc.createTextNode( text.slice( start, end ) ) );
    fragment.appendChild( span );
    cursor = end;
  }
  if( cursor < text.length ){
    fragment.appendChild( doc.createTextNode( text.slice( cursor ) ) );
  }
  textNode.parentNode.replaceChild( fragment, textNode );
}

// Recursively highlights text under `node` that matches actions.patterns.
//   node    - DOM node to scan (a falsy value is ignored)
//   actions - one pattern group: { patterns: [RegExp, ...], styling: "css text" }
// Children whose class attribute contains "pagenumber" are skipped.
// Matching is done on text nodes only, never on serialised HTML, so patterns
// cannot hit tag names or attribute values and no existing element is rebuilt.
function HighlightTyposUnder( node, actions ){
  // typoFuse must be strictly positive: with ">= 0" one hit more than the
  // configured limit was allowed. An unset (undefined) fuse also stops here.
  if( !node || !( typoFuse > 0 ) ){
    return;
  }

  if( node.nodeType == 3 /* TEXT_NODE */ ){
    if( node.parentNode ){
      var ranges = TypoRangesIn( node.nodeValue, actions.patterns );
      if( ranges.length ){
        WrapTypoRanges( node, ranges, actions.styling );
      }
    }
    return;
  }

  if( !node.childNodes || !node.childNodes.length ){
    return;
  }

  // Iterate over a snapshot: wrapping inserts new spans/text nodes into the
  // live childNodes list, and those must not be scanned (and wrapped) again.
  var children = Array.prototype.slice.call( node.childNodes );
  for( var c = 0; c < children.length; c++ ){
    var child = children[ c ];
    var isPageNumber = child.getAttribute && /pagenumber/.test( child.getAttribute( "class" ) );

    if( !isPageNumber ){
      HighlightTyposUnder( child, actions );
    }
  }
}
 
// Entry point: scans the displayed page content for every included pattern group.
//   actions - configuration object { exclude, limit, groups }; when omitted,
//             self.typoscan is used, and if that is missing too the page is
//             treated as excluded.
// Nothing is scanned when the configuration's "exclude" flag is truthy.
function HighlightTyposLike( actions ){
  self.typoscan = self.typoscan || { exclude: true };

  // Previously the fallback above was stored but never read, so a call
  // without an argument failed on "actions.exclude".
  var settings = actions || self.typoscan;

  if( settings.exclude ){       // don't bother scanning historical, edit-in-progress or given ns pages.
    return;
  }

  // Prefer the preview pane when it exists (presume the page is being edited;
  // must not touch unsafe structures like wpEditToken!). Otherwise the user is
  // not editing: assume it is safe to address the entire display region.
  var content = document.getElementById( 'wikiPreview' ) ||
                document.getElementById( 'mw-content-text' );

  typoFuse = settings.limit;

  var groups = settings.groups || [];
  for( var N = 0; N < groups.length; N++ ){
    if( groups[ N ].include ){
      HighlightTyposUnder( content, groups[ N ] );
    }
  }
}
 
self.typoscan={
  exclude:
    /(action=history|(diff|oldid|search)=|(author|category|extension|file|help|index|mediawiki|meta|module|special|talk|template|topic|user|wiki([mp]edia|source))(:|%3A))/i.test( location.href ),
  limit:
    40,
  groups: [
    {
      include:
        true,                                       // this group applies to all pages not already excluded
      patterns: [
        /\\\S/g,                                    // back-slash escape?
        /;[.,:]/g,                                  // chained punctuation, semicolon-led
        /,[.,;:]/g,                                 // chained punctuation, comma-led
        /:[.,;]/g,                                  // chained punctuation, colon-led
        /(^|[^'])Ave(?!([.!]| Maria))(\W|$)/g,      // typo of "we" (but "'Ave", "Ave." or "Ave Maria" is O.K.)
        /\|[-+]?/g,                                 // wikicode leaking into HTML  (yes the '?' is dodgy but enlarges the match for table caption/row)
        /\Wim(der|desirable|less|productive)/g,     // typo of "under"/"undesirable"/"unless"/"unproductive"
        /(?!na)vv(?!(ies|y))/g,                     // typo of "w" (though legitimate in "navvy" or plural)
        /(^|\W)Sts?(?!\.)(\W|$)/g,                  // typo of "St." 
        /\s["'`;:,!?$%*()=+~]\s/g,                  // floating punctuation mark: WARNING: modern style: floating "=" OK
        /modem/gi,                                  // typo of "modern"
        /\w&/g,                                     // embedded or trailing "&"
        /&(?!c\.)\w/g,                              // leading "&" ("&c." O.K.)
        /(^|\W)[a-z]+[A-Z]+[A-Za-z]+(\W|$)/g,       // upper case embedded within lower case word
        /(^|\W)[A-Z]{2,}[a-z]+[A-Za-z]+(\W|$)/g,    // lower case embedded within upper case word
        /(^|\W)[a-zA-Z]+\d+[a-zA-Z]*(\W|$)/g,       // digit embedded within word
        /(^|\W)\d+[a-zA-Z]+\d+(\W|$)/g,             // alphabetic embedded within digits
        /(^|\W)[a-zA-Z]+ \.[a-zA-Z]+(\W|$)/g,       // period surrounded by letters
        / tlie /g,                                  // typo of "he"
        /li /g,                                         // typo of "h"
        / Av/g,                                                                         // " w" at the start of the word
        / op /g,                                    // typo of standalone " of "
        /lI /g,                                     // typo of "ll" or "h"
        /ii/g,                                      // "u" or ü"
        /jj/g,                                      // "p" or "g"
        /\^/g,                                      // standalone "^"?
        /{[[\]]}{1,}/g,                             // mis-terminated template, link or standalone "^"?
        /{[(\)}^]{1,}/g                             // plagiarised parenthesis
      ],
      styling:
        'background:LightSalmon;'
    },
// end of add
  ]
};

// Run the scan once the DOM is ready. ready() must be given a function:
// writing HighlightTyposLike( ... ) directly as the argument executes the scan
// immediately and hands its (undefined) return value to ready().
jQuery( document ).ready( function(){
  HighlightTyposLike( self.typoscan );
} );

// 
 

Exclusion of the Main namespace from highlighting typos[edit]

  • For the exclusion of the Main namespace pages from highlighting typos, copy this code into the submodule:
//  

// main namespace excluded version

var typoFuse;  // remaining highlight budget; overridden by actions.limit

// Returns the sorted, merged [start, end) character ranges of `text` that match
// any of `patterns`. Each pattern that matches at least once costs one unit of
// typoFuse; scanning stops as soon as the budget is exhausted.
function TypoRangesIn( text, patterns ){
  var ranges = [];

  for( var p = 0; p < patterns.length && typoFuse > 0; p++ ){
    var pattern = patterns[ p ];
    var found = false;
    var match;

    pattern.lastIndex = 0;  // global regexes keep state between exec() calls
    while( ( match = pattern.exec( text ) ) !== null ){
      if( match[ 0 ].length === 0 ){
        // an empty match would never advance on its own
        if( !pattern.global ){ break; }
        pattern.lastIndex++;
        continue;
      }
      ranges.push( [ match.index, match.index + match[ 0 ].length ] );
      found = true;
      if( !pattern.global ){ break; }  // without /g exec() keeps returning the first match
    }
    pattern.lastIndex = 0;

    if( found ){
      typoFuse--;
    }
  }

  ranges.sort( function( a, b ){ return a[ 0 ] - b[ 0 ] || a[ 1 ] - b[ 1 ]; } );

  // Merge overlapping/adjacent ranges so a stretch of text is wrapped only once.
  var merged = [];
  for( var r = 0; r < ranges.length; r++ ){
    var last = merged[ merged.length - 1 ];
    if( last && ranges[ r ][ 0 ] <= last[ 1 ] ){
      last[ 1 ] = Math.max( last[ 1 ], ranges[ r ][ 1 ] );
    } else {
      merged.push( ranges[ r ] );
    }
  }
  return merged;
}

// Replaces `textNode` with a fragment in which every range of `ranges` is
// wrapped in <span style="styling">; the text between ranges stays plain text.
function WrapTypoRanges( textNode, ranges, styling ){
  var doc = textNode.ownerDocument || document;
  var text = textNode.nodeValue;
  var fragment = doc.createDocumentFragment();
  var cursor = 0;

  for( var r = 0; r < ranges.length; r++ ){
    var start = ranges[ r ][ 0 ];
    var end = ranges[ r ][ 1 ];

    if( start > cursor ){
      fragment.appendChild( doc.createTextNode( text.slice( cursor, start ) ) );
    }
    var span = doc.createElement( 'span' );
    span.setAttribute( 'style', styling );
    span.appendChild( doc.createTextNode( text.slice( start, end ) ) );
    fragment.appendChild( span );
    cursor = end;
  }
  if( cursor < text.length ){
    fragment.appendChild( doc.createTextNode( text.slice( cursor ) ) );
  }
  textNode.parentNode.replaceChild( fragment, textNode );
}

// Recursively highlights text under `node` that matches actions.patterns.
//   node    - DOM node to scan (a falsy value is ignored)
//   actions - one pattern group: { patterns: [RegExp, ...], styling: "css text" }
// Children whose class attribute contains "pagenumber" are skipped.
// Matching is done on text nodes only, never on serialised HTML, so patterns
// cannot hit tag names or attribute values and no existing element is rebuilt.
function HighlightTyposUnder( node, actions ){
  // typoFuse must be strictly positive: with ">= 0" one hit more than the
  // configured limit was allowed. An unset (undefined) fuse also stops here.
  if( !node || !( typoFuse > 0 ) ){
    return;
  }

  if( node.nodeType == 3 /* TEXT_NODE */ ){
    if( node.parentNode ){
      var ranges = TypoRangesIn( node.nodeValue, actions.patterns );
      if( ranges.length ){
        WrapTypoRanges( node, ranges, actions.styling );
      }
    }
    return;
  }

  if( !node.childNodes || !node.childNodes.length ){
    return;
  }

  // Iterate over a snapshot: wrapping inserts new spans/text nodes into the
  // live childNodes list, and those must not be scanned (and wrapped) again.
  var children = Array.prototype.slice.call( node.childNodes );
  for( var c = 0; c < children.length; c++ ){
    var child = children[ c ];
    var isPageNumber = child.getAttribute && /pagenumber/.test( child.getAttribute( "class" ) );

    if( !isPageNumber ){
      HighlightTyposUnder( child, actions );
    }
  }
}
 
// Entry point: scans the displayed page content for every included pattern group.
//   actions - configuration object { exclude, limit, groups }; when omitted,
//             self.typoscan is used, and if that is missing too the page is
//             treated as excluded.
// Nothing is scanned when the configuration's "exclude" flag is truthy.
function HighlightTyposLike( actions ){
  self.typoscan = self.typoscan || { exclude: true };

  // Previously the fallback above was stored but never read, so a call
  // without an argument failed on "actions.exclude".
  var settings = actions || self.typoscan;

  if( settings.exclude ){       // don't bother scanning historical, edit-in-progress or given ns pages.
    return;
  }

  // Prefer the preview pane when it exists (presume the page is being edited;
  // must not touch unsafe structures like wpEditToken!). Otherwise the user is
  // not editing: assume it is safe to address the entire display region.
  var content = document.getElementById( 'wikiPreview' ) ||
                document.getElementById( 'mw-content-text' );

  typoFuse = settings.limit;

  var groups = settings.groups || [];
  for( var N = 0; N < groups.length; N++ ){
    if( groups[ N ].include ){
      HighlightTyposUnder( content, groups[ N ] );
    }
  }
}
 

self.typoscan={
  exclude:
    /(action=history|(diff|oldid|search)=|(author|category|extension|file|help|index|mediawiki|meta|module|special|talk|template|topic|user|wiki([mp]edia|source))(:|%3A))/i.test( location.href ),
  limit:
    40,
  groups: [
    {
      include:
        document.evaluate(
          "//body[contains(@class,'ns-0')]",
          document,
          null,
          XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE,
          null
        ).snapshotLength===0, // skip pages in main name-space...
      patterns: [
        /\\\S/g,                                    // back-slash escape?
        /;[.,:]/g,                                  // chained punctuation, semicolon-led
        /,[.,;:]/g,                                 // chained punctuation, comma-led
        /:[.,;]/g,                                  // chained punctuation, colon-led
        /(^|[^'])Ave(?!([.!]| Maria))(\W|$)/g,      // typo of "we" (but "'Ave", "Ave." or "Ave Maria" is O.K.)
        /\|[-+]?/g,                                 // wikicode leaking into HTML  (yes the '?' is dodgy but enlarges the match for table caption/row)
        /\Wim(der|desirable|less|productive)/g,     // typo of "under"/"undesirable"/"unless"/"unproductive"
        /(?!na)vv(?!(ies|y))/g,                     // typo of "w" (though legitimate in "navvy" or plural)
        /(^|\W)Sts?(?!\.)(\W|$)/g,                  // typo of "St." 
        /\s["'`;:,!?$%*()=+~]\s/g,                  // floating punctuation mark: WARNING: modern style: floating "=" OK
        /modem/gi,                                  // typo of "modern"
        /\w&/g,                                     // embedded or trailing "&"
        /&(?!c\.)\w/g,                              // leading "&" ("&c." O.K.)
        /(^|\W)[a-z]+[A-Z]+[A-Za-z]+(\W|$)/g,       // upper case embedded within lower case word
        /(^|\W)[A-Z]{2,}[a-z]+[A-Za-z]+(\W|$)/g,    // lower case embedded within upper case word
        /(^|\W)[a-zA-Z]+\d+[a-zA-Z]*(\W|$)/g,       // digit embedded within word
        /(^|\W)\d+[a-zA-Z]+\d+(\W|$)/g,             // alphabetic embedded within digits
        /(^|\W)[a-zA-Z]+ \.[a-zA-Z]+(\W|$)/g,       // period surrounded by letters
        / tlie /g,                                  // typo of "he"
        /li /g,                                         // typo of "h"
        / Av/g,                                                                         // " w" at the start of the word
        / op /g,                                    // typo of standalone " of "
        /lI /g,                                     // typo of "ll" or "h"
        /ii/g,                                      // "u" or ü"
        /jj/g,                                      // "p" or "g"
        /\^/g,                                      // standalone "^"?
        /{[[\]]}{1,}/g,                             // mis-terminated template, link or standalone "^"?
        /{[(\)}^]{1,}/g                             // plagiarised parenthesis
      ],
      styling:
        'background:LightSalmon;'
    },
// end of add
  ]
};

// Run the scan once the DOM is ready. ready() must be given a function:
// writing HighlightTyposLike( ... ) directly as the argument executes the scan
// immediately and hands its (undefined) return value to ready().
jQuery( document ).ready( function(){
  HighlightTyposLike( self.typoscan );
} );

// 
 

Activation[edit]

  • The script is activated by placing the following code in your common.js. Replace "username" with your username.
// Activate the typoscan script stored at User:username/common.js/typoscan.js.
// mw.loader.load() is given the page's raw-JavaScript URL (action=raw, ctype=text/javascript);
// replace "username" in the URL with your own username.
mw.loader.load('//en.wikisource.org/w/index.php?title=User:username/common.js/typoscan.js&action=raw&ctype=text/javascript');