<?php
/*
  -------------------------------------------------------------------------
 AllMyStats V1.80 - Statistiques site web - Web traffic analysis
 -------------------------------------------------------------------------
 Copyright (C) 2008 - 2013 - Herve Seywert
 copyright-GNU-xx.txt
 -------------------------------------------------------------------------
 Web:    http://allmystats.wertronic.com - http://www.wertronic.com
 -------------------------------------------------------------------------

For Yandex, the possible encodings are UTF-8, cp1251, KOI8-R or KOI8-U
*/

 
 // Detects the charset of $string (UTF-8, cp1251, KOI8-R, cp866 or ISO-8859-5)
 // and returns the string converted to $charset.
 //
 // $string  : raw byte string to analyse (e.g. a referer URL).
 // $charset : name of the target encoding, as understood by mbstring.
 // Returns  : the converted string; $string itself when the detected charset
 //            already equals $charset (compared case-insensitively).
 //
 // Every conversion goes through mb_convert_encoding(): convert_cyr_string()
 // no longer exists since PHP 8.0 and utf8_decode() is deprecated since PHP 8.2.
 function detect_convert_charset($string, $charset) { 
	
	// Set to true to log every call through write_trace().
	$trace_debug = false;
	
	// With a single candidate in strict mode, mb_detect_encoding() only tells
	// whether $string is valid UTF-8 (plain ASCII is valid UTF-8 too).
	if (mb_detect_encoding($string, "UTF-8", true) == 'UTF-8') {
		$encoding_type = 'UTF-8';
	} else { // not UTF-8 --> guess between cp1251, KOI8-R, cp866, ISO-8859-5
		$encoding_type = detect_cyr_charset($string);
	}

	// Detected charset => mbstring name of the matching source encoding.
	$mb_source = array(
		'UTF-8'      => 'UTF-8',
		'cp1251'     => 'Windows-1251',
		'KOI8-R'     => 'KOI8-R',
		'cp866'      => 'CP866',
		'ISO-8859-5' => 'ISO-8859-5',
	);

	$dbl_encoding_detected = '';

	if (isset($mb_source[$encoding_type]) && strtolower($charset) <> strtolower($encoding_type)) {
		$string_decoded = mb_convert_encoding($string, $charset, $mb_source[$encoding_type]);
	} elseif (strstr($string, 'yandex') && detects_double_encoding($string, 5) >= 4) {
		// Yandex bug on some keywords: the referer holds cp1251 bytes that were
		// encoded to UTF-8 a second time, eg: íàäðåç÷èê òåñòîâûõ çàãîòîâîê íòç-20ì
		// Undo the outer UTF-8 layer (UTF-8 -> ISO-8859-1 is what utf8_decode()
		// did), then run the detection again on the recovered bytes.
		$string_latin1 = mb_convert_encoding($string, 'ISO-8859-1', 'UTF-8');
		if ($trace_debug) {
			$dbl_encoding_detected = 'utf8_decode + RE DECODE';
			write_trace($string, $encoding_type, $dbl_encoding_detected, $string_latin1);
		}
		// This branch is only reached with valid UTF-8 input whose detected
		// charset equals $charset, so the target here is always UTF-8.
		$string_decoded = detect_convert_charset($string_latin1, 'utf-8');
	} else {
		// Already in the requested charset: nothing to convert.
		$string_decoded = $string;
	}

	// The double-encoding branch has already written its own trace record.
	if ($trace_debug && $dbl_encoding_detected === '') {
		write_trace($string, $encoding_type, '', $string_decoded);
	}
		
	return $string_decoded;
 }


// Guesses which single-byte Cyrillic charset $str is written in.
// Each byte >= 128 gives points to every candidate charset whose letter range
// contains it: LOWERCASE points for the range scored as lowercase letters,
// UPPERCASE points for the range scored as uppercase letters.
// Returns the key with the highest score. 'ISO-8859-1' and 'UTF-8' are listed
// among the candidates but never receive any points.
   define('LOWERCASE',3);
   define('UPPERCASE',1);
   function detect_cyr_charset($str) {
       // Keep this key order: it is the order arsort() starts from.
       $score = array(
           'KOI8-R'     => 0,
           'cp1251'     => 0,
           'cp866'      => 0,
           'ISO-8859-5' => 0,
           'ISO-8859-1' => 0,
           'UTF-8'      => 0,
       );

       $total = strlen($str);
       for ($pos = 0; $pos < $total; $pos++) {
           $byte = ord($str[$pos]);

           // ASCII bytes say nothing about the Cyrillic charset in use.
           if ($byte <= 127) {
               continue;
           }

           // cp866: 160-175 and 224-241 score as lowercase, 128-159 as uppercase.
           if (($byte >= 160 && $byte <= 175) || ($byte >= 224 && $byte <= 241)) {
               $score['cp866'] += LOWERCASE;
           } elseif ($byte <= 159) {
               $score['cp866'] += UPPERCASE;
           }

           // KOI8-R: 192-222 score as lowercase, 223-255 as uppercase.
           if ($byte >= 223) {
               $score['KOI8-R'] += UPPERCASE;
           } elseif ($byte >= 192) {
               $score['KOI8-R'] += LOWERCASE;
           }

           // Windows-1251: 224-255 score as lowercase, 192-223 as uppercase.
           if ($byte >= 224) {
               $score['cp1251'] += LOWERCASE;
           } elseif ($byte >= 192) {
               $score['cp1251'] += UPPERCASE;
           }

           // ISO-8859-5: 208-239 score as lowercase, 176-207 as uppercase.
           if ($byte >= 208 && $byte <= 239) {
               $score['ISO-8859-5'] += LOWERCASE;
           } elseif ($byte >= 176 && $byte <= 207) {
               $score['ISO-8859-5'] += UPPERCASE;
           }
       }

       // Highest score first; the winner is the first key after sorting.
       arsort($score);
       return key($score);
   }


 // Detects cp1251 text that was encoded to UTF-8 a second time.
 // Looks only at the part of $str between the first and the second '?', and
 // measures the longest run of consecutive characters whose UTF-8 encoding
 // lies between C2A1 and C3BF inclusive (U+00A1..U+00FF), i.e. the same range
 // as the guard regex below.
 //
 // $str    : URL to inspect.
 // $max_it : optional cap; scanning stops as soon as the result reaches it.
 // Returns : number of in-range characters that directly follow another
 //           in-range character in the longest run (run length - 1);
 //           0 when there is no query part, no such character, or $str is
 //           not valid UTF-8.
 function detects_double_encoding($str, $max_it='') {

	$max_which_follow = 0;

	$exp_str = explode('?', $str);
	// With the /u modifier preg_match() returns false on invalid UTF-8, so
	// everything below only ever sees well-formed UTF-8.
	if (!isset($exp_str[1]) || !preg_match('/[\x{00A1}-\x{00FF}]/u', $str)) { // Test on Unicode code point
		return $max_which_follow;
	}

	$query = $exp_str[1];
	$query_length = mb_strlen($query, 'UTF-8'); // hoisted: loop-invariant
	$run_length = 0; // length of the run of in-range characters ending at $i

	for ($i = 0; $i < $query_length; $i++) {
		$c = mb_substr($query, $i, 1, 'UTF-8');

		// Byte-wise test on the UTF-8 encoding table. Bounds are inclusive so
		// that U+00FF (double-encoded cp1251 0xFF) does not break a run.
		$in_range = strcmp($c, "\xC2\xA1") >= 0 && strcmp($c, "\xC3\xBF") <= 0;
		if (!$in_range) {
			$run_length = 0;
			continue;
		}

		// The first character of a run follows nothing, hence the "- 1".
		$run_length++;
		$char_which_follow = $run_length - 1;

		if ($max_which_follow < $char_which_follow) {
			$max_which_follow = $char_which_follow;
			if ($max_which_follow == $max_it) {
				break;
			}
		}
	}

	return $max_which_follow;
 }


 // Returns the bytes of $char as one upper-case hexadecimal string,
 // e.g. "\xC3\xA9" gives 'C3A9'.
 // Each byte is written without zero padding, so a byte below 0x10 yields a
 // single hex digit.
 function utfCharToHexNumber($char) {
    $i = 0;
    $number = '';
    // Square-bracket offsets: the $char{$i} form is a parse error since PHP 8.0.
    while (isset($char[$i])) {
        $number .= strtoupper(dechex(ord($char[$i])));
        ++$i;
    }
    return $number;
 }


// Appends one debug record to test_charset.html, located in the directory of
// this file. Best effort: when the file cannot be opened nothing is written
// and no error is raised.
//
// $string                : the original (undecoded) string
// $encoding_type         : charset that was detected for $string
// $dbl_encoding_detected : label of the double-encoding handling, '' if none
// $string_decoded        : result of the conversion
//
// NOTE(review): referer and user agent come from the client and are written
// unescaped into a .html file; view the trace file as plain text only.
function write_trace($string, $encoding_type, $dbl_encoding_detected, $string_decoded) {
	// PHP5.4 Suppress DateTime warnings (if not set in php.ini) => date_default_timezone_set -> UTC
	if(function_exists("date_default_timezone_set") and function_exists("date_default_timezone_get")) {
		date_default_timezone_set(@date_default_timezone_get());
	}

	$Fnm = dirname(__FILE__).'/test_charset.html';

	// Open first: fwrite()/fclose() on a failed handle throw a TypeError on
	// PHP 8 (which @ cannot silence), and an early return also skips the
	// reverse DNS lookup when there is nowhere to write the result.
	$inF = @fopen($Fnm,"a");
	if ($inF === false) {
		return;
	}

	// These keys can be missing (e.g. command line, or no User-Agent header).
	$remote_addr = isset($_SERVER['REMOTE_ADDR']) ? $_SERVER['REMOTE_ADDR'] : '';
	$user_agent  = isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : '';
	$reverse_dns = ($remote_addr !== '') ? gethostbyaddr($remote_addr) : '';

	if(!isset($string)) {
		$string = '';
	}

	$texte = 
	"Date:".date("d-m-Y")." ".date("H:i:s")." ip=".$remote_addr."\n".
	"Reverse DNS : ". $reverse_dns. "\n".
	"User Agent : ".$user_agent. "\n".
	"Referer initial :  ".$string. "\n".
	"Encoding_type :  ".$encoding_type. "\n";
	if(isset($dbl_encoding_detected) && $dbl_encoding_detected <>'') {
		$texte .= "Double encoding detected: ".$dbl_encoding_detected. "\n";
	}

	$texte .= 
	"String_decoded :  ".$string_decoded. "\n".
	"\n\n";

	fwrite($inF,$texte);
	fclose($inF); 
}

?>
