User:Ostrea/proofreading.js

From Wikisource
Jump to navigation Jump to search
Note: After saving, changes may not occur immediately. Click here to learn how to bypass your browser's cache.
  • Firefox / Safari: Hold Shift while clicking Reload, or press either Ctrl-F5 or Ctrl-R (Cmd-R on a Mac)
  • Google Chrome: Press Ctrl-Shift-R (Cmd-Shift-R on a Mac)
  • Internet Explorer: Hold Ctrl while clicking Refresh, or press Ctrl-F5
  • Opera: Clear the cache in Tools → Preferences

For details and instructions about other browsers, see Wikipedia:Bypass your cache.

/*



This page defines a TemplateScript library. It's not meant to be referenced
directly. See [[Wikisource:TemplateScript]] for usage.



*/
/* global $, pathoschild */

/**
 * TemplateScript adds configurable templates and scripts to the sidebar, and adds an example regex editor.
 * @see https://meta.wikimedia.org/wiki/TemplateScript
 * @update-token [[File:Pathoschild/templatescript.js]]
 */
// <nowiki>
$.ajax('//tools-static.wmflabs.org/meta/scripts/pathoschild.templatescript.js', { dataType:'script', cache:true }).then(function() {
	/*********
	** Define library
	*********/
	// Register this library with TemplateScript. Each script entry wraps its handler
	// in a small function rather than passing the handler directly: the handlers are
	// assigned with `var` further down in this callback, so they are still undefined
	// when this call runs, and the wrapper only looks them up when it is invoked.
	pathoschild.TemplateScript.library.define({
		key: 'wikisource.proofreading',
		name: 'Proofreading tools',
		url: '//en.wikisource.org/wiki/Wikisource:TemplateScript#Proofreading',
		description: 'A set of scripts for <a href="/wiki/Help:Proofreading">proofreading works in the <tt>Page:</tt> namespace</a>. This includes tools for cleaning up OCR, generating page templates, and adding common text formatting.',
		categories: [
			{
				name: 'Page tools',
				// every script in this category is restricted to the 'page' namespace
				scripts: [
					{ key: 'add-header', name: 'Add header', script: function(editor) { addPageHeader(editor); }, forNamespaces: 'page' },
					{ key: 'add-footer', name: 'Add footer', script: function(editor) { addPageFooter(editor); }, forNamespaces: 'page' },
					{ key: 'cleanup-ocr', name: 'Clean up OCR', script: function(editor) { pageCleanup(editor); }, forNamespaces: 'page' },
					{ key: 'make-ref', name: 'Make reference', script: function(editor) { makeReference(editor); }, forNamespaces: 'page' },
					{ key: 'smallcaps', name: 'Convert to small-caps', script: function(editor) { smallcaps(editor); }, forNamespaces: 'page' },
					{ key: 'uppercase', name: 'Convert to uppercase', script: function(editor) { upper(editor); }, forNamespaces: 'page' }
				]
			}
		]
	});

	/*********
	** Page context
	*********/
	// Shared context for the page tools. The values below are placeholders;
	// _initialise() fills them in the first time any tool runs.
	var state = {
		initialised: false,  // whether the page context has been initialised
		page: {
			number: null,   // the djvu page number extracted from the URL
			proofed: null   // result of matching the #pagequality element's class against quality0 / quality2-4 (truthy when it matches)
		},
		specialFormats: [] // work-specific header template formats
	};

	/*********
	** Private methods
	*********/
	/**
	 * Populate the shared `state` object used by the page tools.
	 * Runs its body a single time; later calls return immediately.
	 */
	var _initialise = function() {
		if(!state.initialised) {
			state.initialised = true;

			// page metadata: the number comes from a ".djvu/<n>&action=edit" URL,
			// the proofread status from the class of the #pagequality element
			var urlMatch = /\.djvu\/([0-9]+)&action=edit/g.exec(location.href);
			var qualityElement = document.getElementById('pagequality');
			var qualityClass = qualityElement && qualityElement.getAttribute('class');
			var pageNumber = null;
			if(urlMatch !== null)
				pageNumber = parseInt(urlMatch[1], 10);
			state.page = {
				number: pageNumber,
				proofed: qualityClass && qualityClass.match(/quality0|quality[2-4]/)
			};

			// user-defined work formats, read from window.specialFormats.
			// expected shape of each entry:
			//   {
			//      title: /History of England /,
			//      evenHeader: '{{rh|...}}',
			//      oddHeader: '{{rh|...}}',
			//      footer: '',
			//      footerWithReferences: '{{smallrefs}}'
			//   }
			state.specialFormats = window.specialFormats ? [].concat(window.specialFormats) : [];
		}
	};
	
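	/**
	 * Convert the text to title case based on English rules.
	 * NOTE: this helper is currently disabled (the whole definition below is
	 * commented out) and nothing calls it; `smallcaps` lowercases all-caps text instead.
	 * If it is ever re-enabled, `$.each(function...)` must be given `words` as its first argument.
	 * @param {string} text The text to convert.
	 */
	 
	 /**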
	var _titlecase = function(text) {
		// split text into individual words and examine them one by one
		var words = text.toLowerCase().split(" ");
		$.each(function(i, word) {
			switch(word) {
				case "a":
				case "an":
				case "and":
				case "as":
				case "at":
				case "but":
				case "by":
				case "etcetera":
				case "etc.":
				case "for":
				case "from":
				case "in":
				case "nor":
				case "of":
				case "o'":
				case "on":
				case "or":
				case "the":
				case "to":
				case "with":
				case "versus":
				case "vs.":
				case "v.":
				case "yet":
					break; // don't capitalise articles, "to" as part of an infinitive, prepositions or short conjunctions
				default: // capitalise everything else
					words[i] = word.substring(0, 1).toUpperCase() + word.substring(1, words[i].length);
					break;
			}
		});

		// capitalise first word regardless
		words[0] = words[0].substring(0, 1).toUpperCase() + words[0].substring(1, words[0].length);

		// capitalise last word regardless
		var last = words.length-1;
		words[last] = words[last].substring(0, 1).toUpperCase() + words[last].substring(1, words[last].length);

		// reconstruct title
		return words.join(' ');
		
	};*/

	/*********
	** Script methods
	*********/
	/**
	 * Add a {{running header}} template to the page.
	 *
	 * The header text comes from the first entry of `state.specialFormats` whose
	 * `title` matches the page title (`evenHeader` or `oddHeader` depending on the
	 * page number's parity); when none matches a generic {{running header}} is used.
	 * The text is appended to the #wpHeaderTextbox field. Nothing happens when the
	 * page number could not be determined.
	 * @param {object} editor The script helpers for the page.
	 */
	var addPageHeader = function(editor) {
		_initialise();
		
		// without a page number we cannot tell even pages from odd ones
		if(state.page.number === null)
			return;

		var isEven = (state.page.number % 2 === 0);
		var generic = true;
		var headertext = '';
		
		// index loop rather than for...in: for...in would also visit enumerable
		// properties added to Array.prototype, and such a "format" has no title,
		// so it would match every page and yield an undefined header
		for (var i = 0; i < state.specialFormats.length; i++) {
			var format = state.specialFormats[i];
			if (mw.config.get('wgTitle').match(format.title)) {
				headertext = isEven ? format.evenHeader : format.oddHeader;
				generic = false;
				break;
			}
		}

		// no special header matched, use a generic running header
		if (generic) {
			if (isEven)
				headertext = '{{running header|left=|center=}}'; // assume verso, with page number at left
			else
				headertext = '{{running header|center=|right=}}';
		}
		
		$('#wpHeaderTextbox').val(function(index, val) {
			return $.trim(val + '\n' + headertext);
		});

		// if this is unproofed text, then delete the first line of the OCR text, which presumably is raw OCR of the header we've just inserted
		// (when the text has no line break, indexOf returns -1 and the text is left as is)
		if (!state.page.proofed) {
			var text = editor.get();
			editor.set(text.slice(text.indexOf('\n') + 1));
		}
	};

	/**
	 * Clean up OCR errors in the text, and push <noinclude> content at the top
	 * & bottom of the page into the header & footer boxes respectively.
	 *
	 * The replacements run strictly in the order written below; many later rules
	 * depend on the output of earlier ones, so do not reorder them.
	 * @param {object} editor The script helpers for the page.
	 */
	var pageCleanup = function(editor) {
		_initialise();
		
		// push <noinclude> content at the top & bottom into the header & footer
		// (11 and 12 are the lengths of "<noinclude>" and "</noinclude>")
		if (editor.get().match(/^<noinclude\>/)) {
			var headText = editor.get();
			var e = headText.indexOf("</noinclude>");
			$('#wpHeaderTextbox').val(function(i, val) {
				return $.trim(val + "\n" + headText.substr(11, e-11).replace(/^\s+|\s+$/g, ''));
			});
			editor.set(headText.substr(e+12));
		}
		if (editor.get().match(/<\/noinclude\>$/)) {
			var footText = editor.get();
			var s = footText.lastIndexOf("<noinclude>");
			$('#wpFooterTextbox').val(function(i, val) {
				return $.trim(footText.substr(s+11, footText.length-s-11-12).replace(/^\s+|\s+$/g, '') + "\n" + val);
			});
			editor.set(footText.substr(0, s));
		}
		
		// clean up text
		editor
			// remove trailing spaces at the end of each line
			.replace(/ +\n/g, '\n')

			// remove trailing whitespace preceding a hard line break
			.replace(/ +<br *\/?>/g, '<br />')

			// remove trailing whitespace and numerals at the end of page text
			// (numerals are nearly always page numbers in the footer)
			.replace(/[\s\d]+$/g, '')

			// remove trailing spaces at the end of refs
			.replace(/ +<\/ref>/g, '</ref>')
	
			// remove trailing spaces at the end of template calls
			.replace(/ +}}/g, '}}')
			
			// local addition: replace --- with the longdash template
			.replace(/---/g, '{{longdash}}')
			
			// local addition: turn <sup>...</sup> into a small, centred poem block
			.replace(/<sup>/g, '{{smaller|{{Block center|<poem>')
			.replace(new RegExp("</sup>", "g"), '</poem>}}}}')
			
			// remove a double or single space at the start of a line
			.replace(/\n {2}/g, '\n')
			.replace(/\n {1}/g, '\n')
			
			// same at the very start of the page
			.replace(/^ {2}/g, '')
			.replace(/^ {1}/g, '')
	
			// convert double-hyphen to mdash (avoiding breaking HTML comment syntax)
			.replace(/([^\!])--([^>])/g, '$1—$2')
	
			// remove spacing around mdash, but only if it has spaces on both sides
			// (we don't want to remove the trailing space from "...as follows:— ",
			// bearing in mind that the space will already be gone if at end of line).
			.replace(/ +— +/g, '—')
	
			// join words that are hyphenated across a line break, and weird OCR hyphens (¬)
			// (but leave "|-" table syntax alone)
			.replace(/([^\|])[-¬]\n/g, '$1')
			;

		// clean up pages if they don't have <poem>
		// (note that the <sup> rule above may itself have introduced a <poem>)
		if (!editor.contains('<poem>')) {
			editor
				// lines that start with " should probably be new lines,
				// if the previous line ends in punctuation,
				// other than a comma or semicolon
				// and let's get rid of trailing space while we're at it
				.replace(/([^\n\w,;])\n\" */g, '$1\n\n"')
	
				// lines that end with " should probably precede a new line,
				// unless preceded by a comma,
				// or unless the new line starts with a lower-case letter;
				// and let's get rid of preceding space while we're at it
				.replace(/([^,])\ *\"\n([^a-z\n])/g, '$1"\n\n$2')
	
				// remove single line breaks; preserve multiple.
				// but not if there's a tag, template or table syntax either side of the line break
				.replace(/([^>}\|\n])\n([^:#\*<{\|\n])/g, '$1 $2')
	
				// collapse sequences of spaces into a single space
				.replace(/  +/g, ' ')  
				;
		}
		
		// more page cleanup
		editor
			// dump spurious hard breaks at the end of paragraphs
			.replace(/<br *\/?>\n\n/g, '\n\n')

			// remove unwanted spaces around punctuation marks
			.replace(/ ([;:\?!,])/g, '$1')
	
			// unicodify
			.replace(/&mdash;/g, '—')
			.replace(/&ndash;/g, '–')
			.replace(/&quot;/g, '"')
	
			// straighten quotes and apostrophes.
			.replace(/[“”]/g, '"')
			.replace(/[‘’`]/g, '\'')
	
			//OCR fixes
			// convert i9 to 19, etc.
			.replace(/[il]([0-9])/g, '1$1')
	
			// "the", "them", "their", etcetera
			.replace(/tlie/g, 'the')
	
			// "U" -> "ll" when preceded by a lowercase letter.
			.replace(/([a-z])U/g, '$1ll')
	
			// "would", "could"
			.replace(/woidd/g, 'would')
			.replace(/coidd/g, 'could')
			.replace(/shoidd/g, 'should')
	
			// many works have apostrophes missing from OCR
			.replace(/([a-z]) s\b/g, '$1\'s') // it's he's etc
			.replace(/n t\b/g, 'n\'t') //can't isn't didn't etc
			.replace(/([a-zI]) ll\b/g, '$1\'ll') // I'll we'll etc
			.replace(/\bI m\b/g, 'I\'m') // I'm
			.replace(/\b([Yy])ou re\b/g, '$1ou\'re') // you're
			.replace(/\b([Ww])e re\b/g, '$1e\'re') // we're
			.replace(/\b([Tt])hey re\b/g, '$1hey\'re') // they're
			.replace(/([a-zI]) ve\b/g, '$1\'ve') // I've we've etc
	
			// expand diacritical templates
			// (the replacement string is deliberately split so the wiki never sees a literal subst call)
			.replace(/{{((ae|oe|\w[:`'~^-]))}}/g, '{{subst'+':$1}}')
	
			// replace "float center" with "block center"; original template name was misleading enough to warrant routinely fixing
			.replace(/\{\{float center/g, '{{block center')
	
			// convert <center> tags to the center template.
			// [\s\S] matches any character including newlines; the previous [.\n]
			// only matched literal dots and newlines, so the rule never fired.
			.replace(/<center>\s*([\s\S]*?)\s*<\/center>/g, '{{center|$1}}')
			
			// local additions
			
			.replace(/ 1 /g, ' I ')
			.replace(/\(\(/g, '(')
			.replace(/,\(/g, ', (')
			
			// long-s converter: OCR reads the long s (ſ) as "f"
			
			.replace(/fp/g, 'sp')
			.replace(/fs/g, 'ss')
			.replace(/ffu/g, 'ssu')
			.replace(/fex/g, 'sex')
			.replace(/ffion/g, 'ssion')
			.replace(/feldom/g, 'seldom')
			.replace(/fh/g, 'sh')
			.replace(/fign/g, 'sign')
			.replace(/fuch/g, 'such')
			.replace(/foon/g, 'soon')
			.replace(/addreff/g, 'address')
			.replace(/ufy/g, 'usy')
			.replace(/ fy/g, ' sy')
			.replace(/effi/g, 'essi')
			.replace(/ ft/g, ' st')
			.replace(/occafio/g, 'occasio')
			.replace(/ fent/g, ' sent')
			.replace(/fup/g, 'sup')
			.replace(/ufi/g, 'usi')
			.replace(/leaft/g, 'least')
			.replace(/fong/g, 'song')
			.replace(/ufu/g, 'usu')
			.replace(/faw/g, 'saw')
			.replace(/effed/g, 'essed')
			.replace(/fome /g, 'some ')
			.replace(/laft/g, 'last')
			.replace(/referv/g, 'reserv')
			.replace(/kiff/g, 'kiss')
			.replace(/feem/g, 'seem')
			.replace(/ fo /g, ' so ')
			.replace(/filen/g, 'silen')
			.replace(/fob/g, 'sob')
			.replace(/hafte/g, 'haste')
			.replace(/ fide/g, ' side')
			.replace(/feeing/g, 'seeing')
			.replace(/feem/g, 'seem')
			.replace(/feen/g, 'seen')
			.replace(/to rife/g, 'to rise')
			.replace(/ceaf/g, 'ceas')
			.replace(/eferv/g, 'eserv')
			.replace(/fecr/g, 'secr')
			.replace(/fc/g, 'sc')
			.replace(/ifun/g, 'isun')
			.replace(/fta/g, 'sta')
			.replace(/fk/g, 'sk')
			.replace(/fity/g, 'sity')
			.replace(/fta/g, 'sta')
			.replace(/fub/g, 'sub')
			.replace(/maft/g, 'mast')
			.replace(/hefe/g, 'hese')
			.replace(/fw/g, 'sw')
			.replace(/ofom/g, 'osom')
			.replace(/ſ/g, 's')
			.replace(/furp/g, 'surp')
			.replace(/ifed/g, 'ised')
			.replace(/fay/g, 'say')
			.replace(/felf/g, 'self')
			.replace(/pofi/g, 'posi')
			.replace(/uft/g, 'ust')
			.replace(/faid/g, 'said')
			.replace(/fearc/g, 'searc')
			.replace(/fto/g, 'sto')
			.replace(/( )fing([^e])/g, '$1sing$2') //will give trouble with "singed" and "singer"
			.replace(/efti/g, 'esti')
			.replace(/rft/g, 'rst')
			.replace(/moft/g, 'most')
			.replace(/dift/g, 'dist')
			.replace(/caft/g, 'cast')
			.replace(/nft/g, 'nst')
			.replace(/ufe/g, 'use')
			.replace(/fome([^n])/g, 'some$1')
			.replace(/hofe/g, 'hose')
			.replace(/faft/g, 'fast')
			.replace(/([^dlih])eft/g, '$1est') //will give trouble with "bereft"
			.replace(/lfo/g, 'lso')
			.replace(/horfe/g, 'horse')
			.replace(/ fet /g, ' set ')
			.replace(/paff/g, 'pass')
			.replace(/lofe/g, 'lose')
			.replace(/poff/g, 'poss')
			.replace(/fb/g, 'sb')
			.replace(/eafi/g, 'easi')
			.replace(/myf/g, 'mys')
			.replace(/fenfe/g, 'sense')
			.replace(/ftr/g, 'str')
			.replace(/taft/g, 'tast')
			.replace(/mif/g, 'mis')
			.replace(/ rof/g, ' ros')
			.replace(/fm/g, 'sm')
			.replace(/fible/g, 'sible')
			.replace(/fince/g, 'since')
			.replace(/fevera/g, 'severa')
			.replace(/([^hl])eft /g, '$1est ')
			.replace(/,\n/g, '.\n') //comma placed before a new line is turned into period
			.replace(/fervant/g, 'servant')
			
			
			//tesserakt wrangler
			// NOTE(review): several rules in this section are very aggressive — every "1"
			// becomes "I", every "[" / "]" becomes "I" and every "*" is dropped, which
			// also rewrites digits, wikilinks and list markup. They look tuned for one
			// particular work; confirm before reusing elsewhere.
			.replace(/ſ/g, 's')
			.replace(/([a-z])\. ([a-z])/g, '$1 $2')
			.replace(/cd/g, 'ed')
			.replace(/cb/g, 'eb')
			.replace(/\* /g, '"')
			.replace(/\*/g, '')
			.replace(/« |«/g, '"')
			.replace(/© /g, '"')
			.replace(/1/g, 'I')
			.replace(/""/g, '"')
			.replace(/< /g, '')
			.replace(/= /g, '')
			.replace(/ ir /g, ' it ')
			.replace(/ Ir /g, ' It ')
			.replace(/\]|\[/g, 'I')
			.replace(/ \. /g, ' ')
			.replace(/ - /g, ' ')
			.replace(/©/g, '')
			.replace(/\?"\?/g, '?"')
			.replace(/([a-z])\'([a-z])/g, '$1 $2')
			.replace(/ -([a-z])/g, ' $1')
			.replace(/([a-z])- /g, '$1 ')
			.replace(/ ' /g, ' ')
			.replace(/([a-z])' /g, '$1 ')
			.replace(/ '([a-z])/g, ' $1')
			.replace(/ \.([a-z])/g, ' $1')
			.replace(/,,/g, ',')
			.replace(/([a-z])\.([a-z])/g, '$1 $2')
			.replace(/([a-z]) s /g, '$1\'s ')
			.replace(/([a-z]) s,/g, '$1\'s,')
			// the dot is escaped: unescaped it matched any character and mangled
			// every word starting with "s" ("he said" became "he's.id")
			.replace(/([a-z]) s\./g, '$1\'s.')
			.replace(/o clock/g, 'o\'clock')
			.replace(/([a-z]) ll /g, '$1\'ll ')
			.replace(/([a-z]) t /g, '$1\'t ')
			.replace(/([a-z]) re /g, '$1\'re ')
			.replace(/([a-z]) m /g, '$1\'m ')
			;
	};
	
	/**
	 * As you work your way through the page, when you encounter a reference, just mark it with <ref></ref> tags and continue.
	 * Once you've got to the end of the page and proofed the references, simply highlight each reference in turn,
	 * and use this function to move it to its proper position.
	 *
	 * The selected text is moved inside the first empty <ref></ref> pair in #wpTextbox1,
	 * whether that pair comes before or after the selection. Nothing is changed when
	 * there is no empty pair, when nothing is selected, or when the selection overlaps
	 * the pair.
	 * @param {object} editor The script helpers for the page (unused; the textbox is accessed directly).
	 */
	var makeReference = function(editor) {
		_initialise();
		
		var editbox = $('#wpTextbox1').get(0);
		editbox.focus();
		var refStart = editbox.selectionStart;
		var refEnd = editbox.selectionEnd;
		var value = editbox.value;

		var firstref = value.indexOf('<ref></ref>');
		if (firstref === -1 || refStart === refEnd)
			return;

		var insertAt = firstref + 5; // position just after "<ref>"
		var selected = value.slice(refStart, refEnd);

		if (insertAt <= refStart) {
			// usual case: the empty ref precedes the selected footnote text
			editbox.value = value.slice(0, insertAt)
			              + selected
			              + value.slice(insertAt, refStart)
			              + value.slice(refEnd);
		}
		else if (refEnd <= firstref) {
			// the empty ref follows the selection; the original slice arithmetic
			// duplicated the selected text here instead of moving it
			editbox.value = value.slice(0, refStart)
			              + value.slice(refEnd, insertAt)
			              + selected
			              + value.slice(insertAt);
		}
		// otherwise the selection overlaps the empty ref itself: leave the text alone
	};

	/**
	 * Insert formatted references into the footer box if needed.
	 *
	 * The footer comes from the first entry of `state.specialFormats` whose `title`
	 * matches the page title: its `footerWithReferences` when the page text contains
	 * a <ref> tag or {{#tag:ref}} call, otherwise its `footer`. When no entry matches,
	 * the footer is cleared for pages without references and set to a generic
	 * {{smallrefs}} block for pages with references.
	 * @param {object} editor The script helpers for the page (unused; the textboxes are accessed directly).
	 */
	var addPageFooter = function(editor) {
		_initialise();
		
		var editbox = $('#wpTextbox1').get(0);
		var footerbox = $('#wpFooterTextbox').get(0);

		// also recognise refs that carry attributes, e.g. <ref name="x"> or <ref follow="y">
		var hasRefs = /<ref[\s>]/.test(editbox.value) || editbox.value.indexOf("{{#tag:ref") !== -1;

		// Look for a work-specific footer. Titles are matched with String.match, the
		// same way addPageHeader does it: `title` is documented as a RegExp, and
		// String.prototype.contains is not part of standard JavaScript.
		// An index loop is used so enumerable Array.prototype extensions are ignored.
		for (var i = 0; i < state.specialFormats.length; i++) {
			var format = state.specialFormats[i];
			if (mw.config.get('wgTitle').match(format.title)) {
				footerbox.value = hasRefs ? format.footerWithReferences : format.footer;
				return;
			}
		}

		// no special footer matched
		if (!hasRefs) {
			// page contains no refs: just strip out any references tag
			footerbox.value = '';
		}
		else if (typeof doGeneric === 'undefined' || doGeneric) {
			// `doGeneric` is not declared anywhere in this script; treat it as an optional
			// global opt-out instead of throwing a ReferenceError when it is absent
			footerbox.value = '{{block center|{{smallrefs}}}}';
		}
	};

	/**
	 * Wrap the selected text in {{sc}}. A selection that equals its own uppercase
	 * form is lowercased first: small-caps on all-caps text is pointless, and such
	 * text is usually OCR of something that was printed in small-caps.
	 * @param {object} editor The script helpers for the page.
	 */
	var smallcaps = function(editor) {
		_initialise();

		var wrapSelection = function(selected) {
			var isAllCaps = (selected == selected.toUpperCase());
			var content = isAllCaps ? selected.toLowerCase() : selected;
			return '{{sc|' + content + '}}';
		};

		editor.replaceSelection(wrapSelection);
	};

	/**
	 * Replace the selected text with its uppercase form.
	 * @param {object} editor The script helpers for the page.
	 */
	var upper = function(editor) {
		_initialise();

		var toCapitals = function(selected) {
			return selected.toUpperCase();
		};
		editor.replaceSelection(toCapitals);
	};
});
// </nowiki>