/* Author:  Muhammad Abdulla (muhammad@yulghun.com)
 * Sorting routines for Uyghur text
 * Version: 1.0 (Apr. 20, 2009)
 * License: GPL
 */


var ACODE = gac('A');
var ZCODE = gac('Z');
var aCODE = gac('a');
var zCODE = gac('z');
var BPAD  = 0x0600;
var BMAX  = 0x06FF;
var HAMZA = gac('ئ'); 
var BLEN = 256;
var alpha_order = new Array(BLEN);

var sinited = false;
function sinit() {
   sinited = true;

   var n = 1;
   alpha_order[ gac('ا') - BPAD ] = n++;
   alpha_order[ gac('ە') - BPAD ] = n++;
   alpha_order[ gac('ب') - BPAD ] = n++;
   alpha_order[ gac('پ') - BPAD ] = n++;
   alpha_order[ gac('ت') - BPAD ] = n++;
   alpha_order[ gac('ج') - BPAD ] = n++;
   alpha_order[ gac('چ') - BPAD ] = n++;
   alpha_order[ gac('خ') - BPAD ] = n++;
   alpha_order[ gac('د') - BPAD ] = n++;
   alpha_order[ gac('ر') - BPAD ] = n++;
   alpha_order[ gac('ز') - BPAD ] = n++;
   alpha_order[ gac('ژ')  - BPAD ] = n++;
   alpha_order[ gac('س')  - BPAD ] = n++;
   alpha_order[ gac('ش')  - BPAD ] = n++;
   alpha_order[ gac('غ')  - BPAD ] = n++;
   alpha_order[ gac('ف')  - BPAD ] = n++;
   alpha_order[ gac('ق')  - BPAD ] = n++;
   alpha_order[ gac('ك')  - BPAD ] = n++;
   alpha_order[ gac('گ')  - BPAD ] = n++;
   alpha_order[ gac('ڭ')  - BPAD ] = n++;
   alpha_order[ gac('ل')  - BPAD ] = n++;
   alpha_order[ gac('م')  - BPAD ] = n++;
   alpha_order[ gac('ن')  - BPAD ] = n++;
   alpha_order[ gac('ھ')  - BPAD ] = n++;
   alpha_order[ gac('و')  - BPAD ] = n++;
   alpha_order[ gac('ۇ')  - BPAD ] = n++;
   alpha_order[ gac('ۆ')  - BPAD ] = n++;
   alpha_order[ gac('ۈ')  - BPAD ] = n++;
   alpha_order[ gac('ۋ')  - BPAD ] = n++;
   alpha_order[ gac('ې')  - BPAD ] = n++;
   alpha_order[ gac('ى')  - BPAD ] = n++;
   alpha_order[ gac('ي')  - BPAD ] = n++;
}

function is_alpha (cc) {
   if ( (ACODE <= cc && cc <= ZCODE) || (aCODE <= cc && cc <= zCODE) ) {
      return true;
   }

   return false;
}

function is_uy_alpha (cc) {
   if ( BPAD <= cc && cc <= BMAX && cc != gac('،') && cc != gac('؟') && cc != gac('؛') ) {
      return true;
   }

   return false;
}

function uystrcmp (s, d ) {
   var MAX = s.length + d.length; 
   var i = 0;
   var j = 0;
   var s_hamza = MAX;
   var d_hamza = MAX;
   var sch, dch;

   while ( i < s.length && j < d.length ) {
      sch = s.charCodeAt(i); 
      dch = d.charCodeAt(j); 

      /* HAMZA normally does not affect the alphabetical order. However, if two
       * strings differ by a HAMZA, such as suret and sur'et, the one with
       * HAMZA would be alphebetically smaller.
       */
      if ( sch == HAMZA && dch == HAMZA ) {
         i++;
         j++;
         continue;
      } else if ( sch == HAMZA ) {
         if ( s_hamza == MAX ) {
            s_hamza = i;
         }
         i++;
         continue;
      } else if ( dch == HAMZA ) {
         if ( d_hamza == MAX ) {
            d_hamza = j;
         }
         j++;
         continue;
      }

      // use alpha_order to sort Uyghur letters
      if ( BPAD <= sch && sch <= BMAX && BPAD <= dch && dch <= BMAX &&
           alpha_order[sch-BPAD] != 0 && alpha_order[dch-BPAD] != 0 ) {
         salpha = alpha_order[sch-BPAD];
         dalpha = alpha_order[dch-BPAD];
         if ( salpha < dalpha ) {
            return -1;
         } else if ( salpha > dalpha ) {
            return 1;
         }
      } else { // use char values for non-Uyghur letters
         if ( sch < dch ) {
            return -1;
         } else if ( dch < sch ) {
            return 1;
         }
      }

      i++;
      j++;
   }

   // Reaching here means we've reached the end at least one of the strings
   // Whichever string has more to go should be ranked alphabetically higher.
   if ( i == s.length && j < d.length ) {
      return -1;
   } else if ( i < s.length && j == d.length ) {
      return 1;
   }

   // If two strings differ only by HAMZA(s), the one that with lower Hamza index or doesn't have
   // hamza has higher alphabetical value 
   if ( s_hamza < d_hamza ) {
      return -1;
   } else if ( s_hamza > d_hamza ) {
      return 1;
   }

   // I do not think any of these two conditions hold true. But I haven't proved it.
   // Leaving them as they are does not harm.
   // If the beginnings are the same, the shorter string has lower alphabetical order.
   if ( s.length < d.length ) {
      return -1;
   } else if ( s.length > d.length ) {
      return 1;
   }

   return 0;
}

function get_phrases(str, by_line, in_uy) {
   var arr;
   var i, j; 
   var inword = false;

   if ( !by_line ) {
      by_line = false;
   }

   if ( !in_uy ) {
      in_uy = true;
   }

   arr = new Array();
   var word = "";

   j = 0;
   for ( i = 0; i < str.length; i++ ) {
      ch = str.charAt(i);
      cc = str.charCodeAt(i);

      if ( inword ) {
         if ( (by_line && ch != "\n" && ch != "\r") || 
              ((in_uy && is_uy_alpha(cc)) || (!in_uy && is_alpha(cc))) ) {
            word += ch;
         } else {
            arr[j++] = word;
            word = ""; 
            inword = false;
         }
      } else {
         if ( (by_line && ch != "\n" && ch != "\r") || 
              ((in_uy && is_uy_alpha(cc)) || (!in_uy && is_alpha(cc))) ) {
            inword = true;
            word = ch;
         }
      }
   }

   if ( word.length > 0 ) {
      arr[j] = word;
   }

   return arr;
}

function sort() {
   if ( !sinited ) {
      sinit();
   }

   var s = document.getElementById('stext');
   var d = document.getElementById('dtext');
   var b = document.getElementById('sortby');
   var u = document.getElementById('unique');
   var r = document.getElementById('reverse');

   var by = b.options[b.selectedIndex].value;
   var byline;

   if ( by == 'byline' ) {
      byline = true;
   } else {
      byline = false;
   }

   var phrases = get_phrases(s.value, byline); 

   phrases.sort(uystrcmp);

   if ( u.checked ) {
      phrases = unique(phrases);
   }

   if ( r.checked ) {
      phrases = phrases.reverse();
   }

   d.value = phrases.join("\r");
}

function unique(a)
{
   var r = new Array();
   o:for(var i = 0, n = a.length; i < n; i++)
   {
      for(var x = 0, y = r.length; x < y; x++)
      {
         if(r[x]==a[i]) continue o;
      }
      r[r.length] = a[i];
   }
   return r;
}

