/*
 * This is a simple client-side Javascript function for
 * converting the so-called "translit" Latvian characters (aa, ee, uu)
 * into the correct characters in another encoding.
 *    
 * Currently supported only ISO-8859-13, aka Windows-1257, as
 * it is the prevalent codepage in most webpages, but the
 * functionality can be modified easily enough to also support
 * other encodings.
 *
 * !! the function tries to leave alone URIs and words which
 * contain non-Latvian letters of the Latin alphabet (qwxy)
 *
 * !! The function is not perfectly accurate. It could only be, 
 * if it used Latvian language dictionary, and even then Latvian 
 * translit might be hard to translate 100% accurately, e.g. 
 * consider word "saaudzis".
 * 
 * The guiding principle here is - if you aren't sure - don't convert.
 *    
 * USAGE:  doTranslit(form_element_to_check[, encoding_name ])
 * ------------------------------------------------------------
 * Will translate from translit to the specified encoding and
 * update the form element.
 * Default encoding is set in the global var default_encoding.
 * 
 * To add more encodings, just make another global variable with
 * the encoding name and fill it with character sequences that 
 * the according translit chars should be changed to. Note that
 * you have to use _ instead of - in encoding names.
 *
 * Example.
 *   <form name="thisform">
 *         ... <textarea name="tfield">
 *         <button onclick="doTranslit(thisform.tfield)">
 *  
 * (C) 2003 by Emils Klotins aka Grrr / tango23@inbox.lv .
 *    
 * This program is provided "as is" and without any warranty.
 * 
 * You may use, modify and distribute this program without restriction 
 * as long as you preserve the whole of this notice and do not receive 
 * payment for the program.
 *
 *
 * v2.0.2 2006-08-27 - shift and character exceptions (Indoom)
 * v2.0.1 2006-08-16 - fixed smilies issue, for example :chair:. text within : : will 
 * 						not be converted (Indoom)
 * v2.0.0 2006-08-13 - added exception for [quote][/quote], [code][/code], [php][/php]
 *					   and [html][/html] (Indoom)
 * v1.2.3 2003-12-12 - added exceptions for buljons, shçma, changed
 *                     triple wovel processing (eee -> eç etc.)
 *                     removed debug mode as default one from installation,
 *                     added some exceptions according to laacz' email...
 * 
 * v1.2.2 2003-12-12 - Fixed latin word handling to be case-insensitive
 *                     - added 'miljon' and some others as an exception,
 * 					   - fixed exception processing to not to drop the last char of word.
 * v1.2.1 2003-12-12 - fixed exception handling to correctly
 *                     process exceptions which don't start
 *                     at beginning of word.
 *                   - fixed wrapper func to alert and die 
 *                     when non-object passed.
 * 
 * v1.2 2003-12-12 - added documentation, fixed triple vowels.
 *                   tested on IE5.5 and Mozilla FB 0.7
 * 
 * Known bugs/TODOs:
 *   - can't distinguish between priedeeklji & valid start-of-word Latvian letters
 *     "saasinaats" etc. Default is to not to change.
 *   - exceptions can't really be overlapping. ie. more than one
 *     exception per word won't work at the moment.
 */

// Translit patterns, listed in the order decodeChars() applies them.
// An encoding table supplies the replacement for pattern N at index N, so
// every table has to follow exactly this order.
// Special-case patterns go at the FRONT, ahead of the plain digraphs. The
// exceptions list is consulted before any of these patterns are applied.
var translit = [].concat(
	// tripled vowels and doubled soft consonants
	[ /a{3}/g, /e{3}/g, /i{3}/g, /u{3}/g, /llj/g, /nnj/g ],
	// digraphs starting with a capital letter
	[ /Aa/g, /Ch/g, /Ee/g, /Gj/g, /Ii/g, /Kj/g, /Lj/g, /Nj/g, /Sh/g, /Uu/g, /Zh/g ],
	// lower-case digraphs
	[ /aa/g, /ch/g, /ee/g, /gj/g, /ii/g, /kj/g, /lj/g, /nj/g, /sh/g, /uu/g, /zh/g ]
);

// Table name used when the caller does not name an encoding; point it at
// your own table if you add encodings.
var default_encoding = "windows_1257",
	// switch to true to get a debug alert at every processing step
	debug = false;


// Replacement table for the Windows-1257 code page. Entry N is the text
// substituted for whatever translit[N] matches, so the order and the number
// of entries must stay in step with the translit list.
// NOTE(review): the literals below are presumably raw Windows-1257 bytes and
// will only show as Latvian letters when the file is read in that code page;
// confirm the file encoding before re-saving it.
var windows_1257=[
	"aâ", "eç", "iî", "uû", "ïï", "òò", 
	"Â", "È", "Ç", "Ì", "Î", "Í", "Ï", "Ò", "Ð", "Û", "Þ",
	"â", "è", "ç", "ì", "î", "í", "ï", "ò", "ð", "û", "þ"
	];

// Replacement table written with Unicode escapes, so it does not depend on
// the encoding this file is saved in. Entry N replaces whatever translit[N]
// matches: first the tripled vowels and doubled soft consonants
// (aaa, eee, iii, uuu, llj, nnj), then the capitalised digraphs, then the
// lower-case digraphs. decodeTranslit() rejects a table whose length differs
// from translit's, so keep one entry per pattern.
var utf_8=[
	"a\u0101", "e\u0113", "i\u012b", "u\u016b", "\u013c\u013c", "\u0146\u0146",
	"\u0100", "\u010C", "\u0112", "\u0122", "\u012a", "\u0136", "\u013b", "\u0145", "\u0160", "\u016a", "\u017d",
	"\u0101", "\u010d", "\u0113", "\u0123", "\u012b", "\u0137", "\u013c", "\u0146", "\u0161", "\u016b", "\u017e"
	];

/*
 Exceptions.

 Flat list of (regexp, length) pairs describing substrings that must NOT be
 converted. decodeTranslit() searches every word with each regexp; on a hit,
 the `length` characters starting at the match position are copied through
 untouched and only the rest of the word is converted.

 The length has to be given explicitly because a regexp may match more
 characters than should be protected. Example: the first entry covers word
 beginnings such as saasinats, paaugstinats, neefektivs. It matches 4
 characters -- the 4th one tells these apart from words like 'paaatrinats' --
 but only the first 3 are to be left alone, so its length is 3.

 NOTE: inside a character class \b means backspace, not a word boundary, so
 [^\be] reads "any character except backspace or e".
 NOTE: for the (^|[^ghdr])iee entry the match position is the character in
 front of "iee" whenever there is one, so the 3 protected characters start
 at that character.
 NOTE(review): /chara/ is the only entry without the i flag -- confirm
 whether that is intended.
*/
var exceptions=[ /^([ps]aa[^a]|nee[^\be])/i , 3,
				/^[mb]iljon/i, 6,
				/^[mb]iljard/i, 7,
				/buljon/i, 6,
				/(^|[^ghdr])iee[^\be]/i,3,
				/sheem/i, 2,
				/shçm/i, 3,
				/vakuum/i,6,
				/shift/i,5,
				/chara/,5
	  ];

// program tag placed at the start of the alert messages
var tl_prog = 'JS/translit';


/*
 * Applies the translit patterns to one text fragment.
 *
 * in_string    - fragment to convert.
 * cp_reference - replacement table; entry N replaces every match of
 *                translit[N].
 *
 * Returns the converted fragment, or null when in_string is not a string
 * (an alert is shown first when debug is on).
 *
 * The patterns are applied one after another in table order, so later
 * patterns see the output of earlier ones.
 */
function decodeChars(in_string,cp_reference)
{
	if (typeof(in_string)!="string")
	{
		if (debug)
			alert(tl_prog+"error decodeChars: non-string given");
		return null;
	}

	// Stop at the shorter of the two tables: replace(undefined, ...) would
	// otherwise rewrite the literal text "undefined".
	var sym_count=Math.min(translit.length,cp_reference.length);
	var result=in_string;
	var counter;

	for (counter=0; counter<sym_count; counter++)
		result=result.replace(translit[counter],cp_reference[counter]);
	return result;
}

/*
 * Converts the translit sequences in input_string using an encoding table.
 * This should be called by a wrapper func such as doTranslit(), not directly.
 *
 * input_string - text to convert.
 * encoding     - name of a variable that holds the replacement table (for
 *                example "windows_1257"; only plain or dotted identifiers
 *                are accepted), or the table array itself.
 *
 * Returns the converted text, or null when input_string is not a string,
 * when the table cannot be found, or when the table does not have exactly
 * one entry per translit pattern.
 *
 * The text is split into tokens: whitespace runs, words, whole
 * [code] / [quote] / [html] / [php] blocks and :smiley: codes. Tokens that
 * contain a URI, one of the letters q, w, x, y, or a bracket block / smiley
 * are copied unchanged. In every other token the spans protected by the
 * `exceptions` list are copied unchanged and the remaining pieces go through
 * decodeChars().
 */
function decodeTranslit(input_string,encoding)
{
	var URI_definition=/((https?|ftp|telnet):\/\/|mailto:)[A-z0-9%:@#=_\/\.\?\+\-]+/;
	var latinword = /[qwxy]/i;
	var quotes = /(\[code\][\s\S]*?\[\/code\])|(\[quote[\s\S]*?\][\s\S]*?\[\/quote\])|(\[html\][\s\S]*?\[\/html\])|(\[php\][\s\S]*?\[\/php\])|:[^\s]+?:/i;
	// tokenizer (reworked to use a regex instead of split(" ")): bracket
	// blocks and smilies first, then words, then whitespace runs, so that
	// joining the tokens gives back the original spacing
	var myregexp = /(\[code\][\s\S]*?\[\/code\])|(\[quote[\s\S]*?\][\s\S]*?\[\/quote\])|(\[html\][\s\S]*?\[\/html\])|(\[php\][\s\S]*?\[\/php\])|:[^\s]+?:|([^\s]+)|(\s+)/ig;
	// shape of an acceptable table name: identifier, optionally dotted
	var table_name = /^[A-Za-z_$][A-Za-z0-9_$]*(\.[A-Za-z_$][A-Za-z0-9_$]*)*$/;

	var cp_reference=null;
	var words,wc,excount,ranges;
	var i,j,word,resultword,excplace,start,end,pos;
	var result=[];

	// orders protected spans by their start position
	function byStart(a,b)
	{
		return a[0]-b[0];
	}

	if (typeof(input_string)!="string")
	{
		if (debug) alert(tl_prog+" error: decodeTranslit needs a string to convert");
		return null;
	}

	if (typeof(encoding)=="string")
	{
		// SECURITY: tables are found by variable name, which needs eval().
		// Only identifier-shaped names are evaluated, so text such as
		// "alert(1)" is never executed. A name that does not resolve is
		// reported as "not found" below instead of throwing.
		if (table_name.test(encoding))
		{
			try
			{
				cp_reference=eval(encoding);
			}
			catch (lookup_error)
			{
				cp_reference=null;
			}
		}
	}
	else
	{
		// eval() hands non-string arguments back unchanged, so a table
		// passed directly has always been accepted; keep that working
		cp_reference=encoding;
	}

	if (cp_reference == null)
	{
		if (debug) alert(tl_prog+" error: Encoding "+encoding+" not found!");
		return null;
	}
	// sanity checks
	if (cp_reference.length != translit.length)
	{
		if (debug) alert(tl_prog+" error: Encoding "+encoding+" has different "+
			"size than the translit table, not all chars defined to be changed!");
		return null;
	}

	// apparently as text can contain both latin words and URIs we will need
	// to work on word basis.
	words=input_string.match(myregexp);
	if (words == null)
		return "";	// match() yields null for the empty string

	wc=words.length;
	excount=exceptions.length;

	// analyzing each word...
	for (i=0; i<wc; i++)
	{
		word=words[i];

		// let the words with specific chars and URIs alone
		if (word.search(URI_definition)!=-1 || word.search(latinword)!=-1 || word.search(quotes)!=-1)
		{
			result.push(word);
			continue;
		}

		if (debug) alert("Analyzing: "+word);

		// collect the [start, end) spans that must not be converted
		ranges=[];
		for (j=0; j+1<excount; j+=2)
		{
			excplace=word.search(exceptions[j]);
			if (excplace != -1)
			{
				if (debug) alert("Found exception "+exceptions[j]+" length: "+exceptions[j+1]);
				ranges.push([excplace, excplace+exceptions[j+1]]);
			}
		}
		if (debug && ranges.length > 0)
			alert("Found "+ranges.length+" exceptions in "+word+"\n"+ranges.join(","));

		// Walk the spans from left to right. Sorting first means the result
		// does not depend on the order of the exceptions table, and
		// overlapping spans are merged instead of being emitted twice.
		ranges.sort(byStart);
		resultword="";
		pos=0;
		for (j=0; j<ranges.length; j++)
		{
			start=ranges[j][0];
			end=ranges[j][1];
			if (end <= pos)
				continue;	// already covered by an earlier span
			if (start > pos)
				resultword+=decodeChars(word.slice(pos,start),cp_reference);
			else
				start=pos;
			if (debug) alert("processing exception, adding "+word.slice(start,end));
			resultword+=word.slice(start,end);
			pos=end;
		}
		if (debug) alert("adding the rest of the word:"+word.slice(pos));
		resultword+=decodeChars(word.slice(pos),cp_reference);

		result.push(resultword);
	}
	return result.join("");
}


// wrapper function, this is the one that browsers should refer to.

/*
 * Wrapper for use from page markup, e.g. onclick="doTranslit(form.field)".
 *
 * arguments[0] - the text input or textarea element to convert.
 * arguments[1] - optional encoding, handed to decodeTranslit(); defaults to
 *                default_encoding.
 *
 * Replaces the element's value with the converted text. When the element is
 * missing or is not a text field an alert is shown and nothing is changed;
 * when the conversion itself fails the value is left as it was.
 *
 * Always returns false.
 */
function doTranslit()
{
	if (arguments.length < 1)
	{
		alert(tl_prog+" too few arguments to doTranslit");
		return false;
	}
	var encoding=(arguments.length > 1 ? arguments[1] : default_encoding);
	var input_object=arguments[0];

	// null/undefined (for example a mistyped form field name) has no .type
	// to inspect; report it instead of throwing
	if (input_object == null)
	{
		alert(tl_prog+" no object passed to doTranslit()");
		return false;
	}

	if (! (input_object.type=="text" || input_object.type=="textarea"))
	{
		alert(tl_prog+" non-text form element passed to doTranslit? Can't "+
			" get value of "+input_object);
		return false;
	}

	var result=decodeTranslit(input_object.value,encoding);

	if (debug) alert(tl_prog+" final result, got: "+result);

	// decodeTranslit() returns null when it cannot convert; writing that
	// back would wipe what the user typed
	if (typeof(result)=="string")
		input_object.value=result;

	return false;
}
