User:Pathoschild/standardise-dev.js

From Wikisource
Jump to navigation Jump to search
Note: After saving, changes may not occur immediately. Click here to learn how to bypass your browser's cache.
  • Firefox / Safari: Hold Shift while clicking Reload, or press either Ctrl-F5 or Ctrl-R (Cmd-R on a Mac)
  • Google Chrome: Press Ctrl-Shift-R (Cmd-Shift-R on a Mac)
  • Internet Explorer: Hold Ctrl while clicking Refresh, or press Ctrl-F5
  • Opera: Clear the cache in Tools → Preferences

For details and instructions about other browsers, see Wikipedia:Bypass your cache.

// <pre><nowiki>
/************************
*********
********* This is the cutting-edge development version, and is frequently
********* broken. You should use [[User:Pathoschild/standardise.js]] instead.
*********
************************/

/*************
*** Wikisource standardization extension (development version)
*** for the Regex menu framework 1.2+, http://meta.wikimedia.org/wiki/User:Pathoschild/Script:Regex_menu_framework
*** by [[m:user:Pathoschild]]
*************/
/**
 * Standardise the wikitext currently in the edit box.
 *
 * Steps, in order:
 *   1. swap <nowiki>, <poem> and <pre> blocks for placeholders so they are left untouched;
 *   2. namespace 0: rebuild the {{header}} template as {{header2}} and join false newlines;
 *   3. namespace 102: rebuild the {{author}} template, filling years and names where possible;
 *   4. general cleanup of templates, headers, categories, links and lists;
 *   5. move licenses, categories and interlanguage links to the end of the page;
 *   6. restore the protected blocks and set the edit summary.
 *
 * Reads and writes the global `editbox.value`, and reads the globals `wgNamespaceNumber`
 * and `wgTitle`. Calls the Regex menu framework globals `regex(search, replace, repeat)`,
 * `regsearch(search)` and `setreason(text)`; they are not defined in this file, so this
 * function presumably relies on regex() replacing within editbox.value (optionally several
 * times) and on regsearch() returning the result of matching editbox.value -- confirm
 * against the framework.
 *
 * Takes no arguments and returns nothing.
 */
function standardize() {
	/******************
	*** Local helpers
	******************/
	/* read one "~name=value" parameter from the delimited header template; null when absent */
	function headerparam(template, name) {
		var found = new RegExp('^[\\s\\S]*~' + name + '=([^~}]*)[\\s\\S]*$').exec(template);
		return found ? found[1] : null;
	}

	/* numeric value of a year such as "1870" or "44 BCE" (BCE is negative); NaN if unparsable */
	function yearvalue(year) {
		var parts = /^(\d+)( BCE)?$/i.exec(year);
		if(!parts) {
			return NaN;
		}
		var value = parseInt(parts[1], 10);
		return parts[2] ? -value : value;
	}

	/* compare function for case-insensitive sorting */
	// courtesy <http://www.webreference.com/js/tips/000430.html>
	function caseless(a,b) {
		var left = a.toLowerCase();
		var right = b.toLowerCase();
		if (left < right) return -1;
		if (left > right) return 1;
		return 0;
	}

	/******************
	*** Content exceptions
	******************/
	/* exception pattern */
	var pattern = '<(nowiki|poem|pre)[^>]*>[\\s\\S]*?<\\/\\1>'; // double-escaping needed for RegExp()

	/* store exceptions in an array and replace them with placeholders */
	// done in a single case-insensitive pass, so every stored block gets exactly one placeholder
	var exceptionvalues = [];
	editbox.value = editbox.value.replace(new RegExp(pattern, 'ig'), function(block) {
		exceptionvalues.push(block);
		return '~exception~';
	});

	/*******************
	*** Mainspace
	*******************/
	if(String(wgNamespaceNumber) === '0') {
		/*******************
		*** header normalization
		*******************/
		/* prepare template for parsing if present */
		if(editbox.value.match(/{{\s*(?:msg:|template:)?\s*header/i)) {
			/* header syntax */
			// the prefix is optional, matching the test above
			regex(/{{\s*(?:msg:|template:)?\s*header/i,'{{header');

			/* fix delimiters */
			// header parameters
			regex(/({{header2?[^\n]*)[\n\s]*\|[\s\n]*(previous|next|title|section|author|notes)\s*=\s*/ig,'$1~$2=',6);

			// nested templates (up to 5)
			regex(/({{header[\s\S]*?{{[^\|}]*)\|/ig,'$1%%pipe%%',5);
			regex(/({{header[\s\S]*?){{([^}]+)}}/ig,'$1%%leftcurlies%%$2%%rightcurlies%%',5);

			/* cleanup header */
			// deprecated arrows and brackets (entities matched with their semicolon so none is left behind)
			regex(/((?:previous|next)[^~]*?)(?:&larr;|&rarr;|←|→)/ig,'$1');
			regex(/(section\s*=\s*)\(([^~}]*)\)/,'$1$2');

			// trailing whitespace
			regex(/{{(header[^}]+)\s*}}/i,'{{$1}}');

			// non-semantic line-breaks
			regex(/({{header[^}]+(?:title|section|author)=[^~]*)<[^>]*br[^>]*>/ig,'$1',8);
		}

		/* prepare values */
		// String() turns a failed match into "null", which none of the parameter patterns match
		var headertemplate = String(editbox.value.match(/{{header[\s\S]+?}}/i));

		/* parse */
		// title (falls back to the page title up to the first slash)
		var headertitle = headerparam(headertemplate, 'title');
		if(headertitle === null) {
			headertitle = String(wgTitle.match(/^[^\/]+/));
		}

		// author
		var headerauthor = headerparam(headertemplate, 'author');
		if(headerauthor === null) {
			headerauthor = '';
		}

		// section (falls back to the last subpage name)
		var headersection = headerparam(headertemplate, 'section');
		if(headersection === null) {
			if(wgTitle.match(/.+\//)) {
				headersection = wgTitle.replace(/^.*\/([^\/]+)/,'$1');
			}
			else {
				headersection = '';
			}
		}

		// chapter number from a ".../Chapter N" title, used for the previous/next fallbacks
		var chapter = null;
		if(wgTitle.match(/\/Chapter \d+$/)) {
			chapter = parseInt(wgTitle.replace(/^.*\/Chapter (\d+)$/,'$1'), 10);
		}

		// previous
		var headerprevious = headerparam(headertemplate, 'previous');
		if(headerprevious === null) {
			if(chapter !== null && chapter-1 > 0) {
				headerprevious = '[[../Chapter '+(chapter-1)+'|Chapter '+(chapter-1)+']]';
			}
			else {
				headerprevious = '';
			}
		}

		// next
		var headernext = headerparam(headertemplate, 'next');
		if(headernext === null) {
			if(chapter !== null) {
				headernext = '[[../Chapter '+(chapter+1)+'|Chapter '+(chapter+1)+']]';
			}
			else {
				headernext = '';
			}
		}

		// notes
		var headernotes = headerparam(headertemplate, 'notes');
		if(headernotes === null) {
			headernotes = '';
		}
		else {
			headernotes = headernotes.replace(/^([\s\S]*?)[\n\s]*$/,'$1'); // trailing whitespace
		}

		/* remove old template */
		editbox.value = editbox.value.replace(/{{header[^}]*}}\n*/ig,'');

		/* place new template */
		editbox.value = '{{header2'
		   + '\n | title    = '+headertitle
		   + '\n | author   = '+headerauthor
		   + '\n | section  = '+headersection
		   + '\n | previous = '+headerprevious
		   + '\n | next     = '+headernext
		   + '\n | notes    = '+headernotes
		   + '\n}}\n'
		   + editbox.value;

		/*******************
		*** fix false newlines
		*******************/
		/* replace newlines with placeholders */
		regex(/([^\n])\n(\s*[^\n])/ig,'$1%%newline%%$2');

		/* selectively restore legitimate newlines */
		// paragraphs
		regex(/%%newline%%%%newline%%|%%newline%%\n|\n%%newline%%/ig,'\n\n');

		// templates
		regex(/%%newline%%(\s*(?:}}|\|))/ig,'\n$1'); // before
		regex(/}}\s*%%newline%%/ig,'}}\n'); // after

		// images, categories, interwiki links
		regex(/%%newline%%(\s*\[\[(?:Image|Category|[^:]+):[^\]]+\]\])/ig,'\n$1');
		regex(/(\s*\[\[(?:Image|Category|[^:]+):[^\]]+\]\])%%newline%%/ig,'$1\n');

		// lists
		regex(/%%newline%%([*#:;])/ig,'\n$1'); // lists
		regex(/([*#:;][^\n]*)%%newline%%/ig,'$1\n'); // newlines closing list items

		// tables
		regex(/%%newline%%{\|/ig,'\n{|');
		regex(/{\|%%newline%%/ig,'{|\n');

		// rules
		regex(/%%newline%%(----+)/g,'\n$1');
		regex(/(----+)%%newline%%/g,'$1\n');

		// tags
		regex(/(<[^>\n]+>)\s*%%newline%%/ig,'$1\n');
		regex(/%%newline%%(<[^>\n]+>)/ig,'\n$1');

		/* remove remaining */
		regex(/-%%newline%%([^\s])/ig,'-$1'); // hyphenated words
		regex(/\s*%%newline%%\s*/ig,' '); // all others

		/*******************
		*** Cleanup
		*******************/
		/* restore delimiters */
		regex(/%%pipe%%/g,'|');
		regex(/%%leftcurlies%%/g,'{{');
		regex(/%%rightcurlies%%/g,'}}');
	}

	/*******************
	*** Authorspace
	*******************/
	if(String(wgNamespaceNumber) === '102') {
		/*******************
		*** {{author}} normalization
		*******************/
		/* fix delimiters */
		regex(/[\n\s]*\|\s*((?:first|last)name|last_initial|(?:birth|death)year|description|image|(?:wikipedia|wikiquote|commons)_link|dates|name|defaultsort)\s*=\s*/ig,'~$1='); // author parameter delimiters
		regex(/({{author[\s\S]*?{{[^\|}]*)\|/ig,'$1%%pipe%%',5); // other template pipes
		regex(/({{author[\s\S]*?){{([^}]+)}}/ig,'$1%%leftcurlies%%$2%%rightcurlies%%',5); // other template delimiters

		/* cleanup */
		regex(/{{(author[^}]+)\s*}}/i,'{{$1}}'); // rm trailing whitespace

		/* place standard template and move like parameters */
		regex(/{{author/i,'{{author\n |firstname      =\n |lastname       =\n |last_initial   =\n |birthyear      =\n |deathyear      =\n |description    =\n |image          =\n |wikipedia_link =\n |wikiquote_link =\n |commons_link   =\n}}\n{{author');
		regex(/(author[\s\S]*?\|((?:first|last)?name|last_initial|(?:birth|death)year|description|image|(?:wikipedia|wikiquote|commons)_link|dates|defaultsort)\s*)=([\s\S]*?)~?\2=([^~]*)/i,'$1=$4$3',10);

		/* get dates if necessary */
		if(regsearch(/(?:birth|death)year\s*=\s*\n/)) {
			// cannibalise categories
			regex(/(birthyear\s*)=(\s*\n[\s\S]*?)\n?\[\[\s*Category\s*:\s*(\d+(?:\s*BCE)?) births\s*[^\]]*\]\]/,'$1=$3$2');
			regex(/(deathyear\s*)=(\s*\n[\s\S]*?)\n?\[\[\s*Category\s*:\s*(\d+(?:\s*BCE)?) deaths\s*[^\]]*\]\]/,'$1=$3$2');

			// if that failed, parse from old template
			if(regsearch(/(?:birth|death)year\s*=\s*\n/) && regsearch(/~dates=[^~]/)) {
				/* get dates */
				// get raw parameter
				var olddates = editbox.value.replace(/^[\s\S]*dates=[^\d~}]*([^~}]+)[\s\S]*$/,'$1'); // raw parameter
				olddates = olddates.replace(/^(\d+)\s*BC?E/ig,'$1 BCE'); // fix eras

				// extract dates
				var birthyear = olddates.replace(/^(\d+(?: BCE)?)[\s\S]*$/ig,'$1');
				var deathyear = olddates.replace(/^\d+[^\d]+?(\d+(?: BCE)?)$/ig,'$1');

				/* fill in empty parameters */
				// years are compared numerically (BCE negative); unparsable values are never written
				if(regsearch(/birthyear\s*=\s*\n/) && !isNaN(yearvalue(birthyear))) {
					regex(/(birthyear\s*)=/,'$1='+birthyear);
				}
				if(regsearch(/deathyear\s*=\s*\n/) && yearvalue(deathyear)>yearvalue(birthyear)) {
					regex(/(deathyear\s*)=/,'$1='+deathyear);
				}
			}
		}

		/* get names */
		if(regsearch(/(?:first|last)name\s*=\s*\n/)) {
			// cannibalise name field, but only when the old template actually had one
			var namematch = /^[\s\S]*~name=([^~}]*)/.exec(editbox.value);
			if(namematch) {
				// extract
				var name = namematch[1];
				var firstname = name.replace(/([\s\S]+)\s+[\s\S]*/,'$1');
				var lastname = name.replace(/[\s\S]+\s+([\s\S]*)/,'$1');

				// fill in empty parameters
				if(regsearch(/firstname\s*=\s*\n/)) {
					regex(/(firstname\s*)=/,'$1='+firstname);
				}
				if(regsearch(/lastname\s*=\s*\n/)) {
					regex(/(lastname\s*)=/,'$1='+lastname);
				}
			}
		}
		/* cleanup */
		// remove old template
		regex(/({{author[\s\S]*?)[\n\s]*{{author[^}]*}}[\n\s]*/ig,'$1\n\n');

		// restore delimiters
		regex(/%%pipe%%/g,'|');
		regex(/%%leftcurlies%%/g,'{{');
		regex(/%%rightcurlies%%/g,'}}');

		// fix whitespace
		regex(/((?:(?:first|last)name|last_initial|(?:birth|death)year|description|image|(?:wikipedia|wikiquote|commons)_link)\s*)=\s*/ig,'$1= ');
		regex(/= \|/g,'= \n |');
		regex(/= }}/g,'= \n}}');

		/* remove old categories */
		regex(/\[\[\s*Category\s*:\s*\d+[^\]]*?(?:births|deaths)[^\]]*\]\]\n?/ig,''); // authors by year
		regex(/\[\[\s*Category\s*:\s*(?:Ancient|Early modern|Medieval|Modern|Renaissance) authors[^\]]*\]\]\n?/ig,''); // authors by era

		/*******************
		*** Other tweaks
		*******************/
		/* update license templates */
		regex(/{{\s*(?:msg:|template:)?(?:author-)?(PD-[^\|\}]+)(?:\|[^}]*)?}}/ig,'{{$1}}');

		/* normalize dates */
		regex(/^([#*:]+ \[\[[^\]]+\]\]),\s*(\d+)/mig,'$1 ($2)');
	}

	/*******************
	*** miscellaneous cleanup
	*******************/
	/*   templates */
	regex(/{{\s*(?:msg:|template:)?([^}]+)}}/ig,'{{$1}}');

	/* syntax */
	// headers
	regex(/\n*^(=+)\s*(.*?)\s*\1\s*/mig,'\n\n$1$2$1\n'); // whitespace
	regex(/=\n+=/ig,'=\n='); // fix consecutive headers

	// categories
	regex(/\[\[\s*category\s*:\s*([^\|\]]+)(?:\s*(\|)([^\]]*))?\s*\]\]/ig,'[[category:$1$2$3]]');

	//links
	regex(/\[\[\s*([^\|\]]+?)\s*(?:(\|)\s*([^\]]+?)\s*)?\]\]/ig,'[[$1$2$3]]'); // redundant starting and ending whitespace
	regex(/\[\[([^\|\]]+?)\s*\|\s*\1\]\]/ig,'[[$1]]'); // redundant link text
	regex(/\[\[([^\|\]]+?)_/ig,'[[$1 ',5); // underscores

	// lists
	regex(/^([*#:]+)\s*/mig,'$1 ');


	/*******************
	*** sort elements
	*******************/
	/* store elements and remove from code */
	// categories
	var categories = regsearch(/\[\[category:[^\]]+\]\]/ig);
	regex(/\[\[category:[^\]]+\]\]\n?/ig,'');

	// interlanguage links
	// links with a known non-interlanguage prefix stay where they are and are not collected,
	// so they are neither moved to the end nor duplicated
	var interwikilinks = [];
	editbox.value = editbox.value.replace(/(\[\[[a-z]{2,3}(?:-[^:\|\]]+)?:[^\]]+\]\])\n?/ig, function(found, link) {
		if(/^\[\[(?:c2|cej|dcc|mw|rev|rfc|svn|wqy):/i.test(link)) {
			return found;
		}
		interwikilinks.push(link);
		return '';
	});

	// license templates
	var licenses = regsearch(/{{(?:PD-|GFDL)[^}]*}}/ig);
	regex(/{{(?:PD-|GFDL)[^}]*}}\n?/ig,'');

	/* sort and re-add */
	var i;

	// initial whitespace
	regex(/[\s\n]*$/,'\n\n');

	// licenses
	if(licenses && licenses.length>0) {
		// licenses
		for(i=0; i<licenses.length; i++) {
			editbox.value = editbox.value+licenses[i]+'\n';
		}
		// whitespace
		editbox.value = editbox.value+'\n';
	}

	// categories
	if(categories && categories.length>0) {
		// sort and place
		categories.sort(caseless);
		for(i=0; i<categories.length; i++) {
			editbox.value = editbox.value+categories[i]+'\n';
		}
		// whitespace
		editbox.value = editbox.value+'\n';
	}

	// interlanguage links
	// sort and place
	interwikilinks.sort(caseless);
	for(i=0; i<interwikilinks.length; i++) {
		editbox.value = editbox.value+interwikilinks[i]+'\n';
	}

	/* restore exceptions */
	// a replacer function inserts the stored text literally, so "$&", "$1" etc. inside
	// protected blocks are not interpreted as replacement patterns
	var restored = 0;
	editbox.value = editbox.value.replace(/~exception~/g, function(placeholder) {
		if(restored < exceptionvalues.length) {
			return exceptionvalues[restored++];
		}
		return placeholder;
	});

	/* edit summary */
	setreason('[[WS:STYLE|standardization]], updates, and cleanup with [[m:User:Pathoschild/Script:Regex menu framework|regex]]');
}
// </nowiki></pre>