<?php

/**
 * Seltzlab - web applications development
 * http://www.seltzlab.com
 * mail: info@seltzlab.com
 * skype: seltzlab
 * 
 * This work is licensed under Creative Commons Attribution-Share Alike 3.0
 * http://creativecommons.org/licenses/by-sa/3.0/us/deed.en
 */

/**
 * Class to analyze semantic data from an HTML chunk
 */
class seltz_analyzer
{
    /**
     * Turn debug output on/off.
     * @var boolean
     */
    private $debug = false;

    /**
     * seltz_analyzer class version.
     * @var number
     */
    public $version = 0.4;

    /**
     * Autodetected document type: 'url' or 'string'.
     * @var string
     */
    public $doctype;
    /**
     * Document text.
     * @var string
     */
    public $doctext;
    /**
     * Document language as a 2-character code.
     * @var string
     */
    public $doclang;
    /**
     * When doctype is 'url', the document url.
     * @var string
     */
    public $docurl;
    /**
     * Document encoding. When not given in the configuration it is read from
     * the html meta http-equiv tag; when that fails it is set to iso-8859-1.
     * @var string
     */
    public $encoding;

    /**
     * List of detected words (word => score), filled by buildStruct().
     * @var array
     */
    public $words;

    /**
     * Weights used in the score computation for our "good tags" and for the
     * uppercase / spell-check heuristics.
     */
    const weight_ucfirst = 4;
    const weight_pspell = 3;
    const weight_strong = 5;
    const weight_em = 5;
    const weight_span = 4;
    const weight_p = 1;
    const weight_cite = 1;
    const weight_acronym = 2;
    //const weight_div = 0;

    /**
     * Single characters that terminate a word.
     * @var array
     */
    public $words_delimiter = array(' ', ';', ',', '.', ':', '\'', '"', '!', '?', '+', '(', ')', '[', ']', '/', '\\', '=', "\n", "\r", "\t");

    /**
     * Tags we assume are good for a semantic meaning (tag name => weight).
     * @var array
     */
    public $goodTags = array(
        'strong' => self::weight_strong,
        'b' => self::weight_strong,
        'em' => self::weight_em,
        'i' => self::weight_em,
        'span' => self::weight_span,
        'p' => self::weight_p,
        'acronym' => self::weight_acronym,
        'cite' => self::weight_cite,
        //'div' => self::weight_div
    );
    /**
     * Tags we assume are particularly good for a semantic meaning: their
     * whole text content is scored as a single phrase.
     * @var array
     */
    public $reallyGoodTags = array(
        'strong',
        'b',
        'em',
        'i',
        'span'
    );
    /**
     * Tags we assume are not good for a semantic meaning. Remember, at the
     * moment everything is aimed at getting some keywords linked on wikipedia.
     * @var array
     */
    public $badTags = array(
        //'a',
        'h1',
        'h2',
        'h3',
        'h4',
        'h5',
        'h6',
        'li',
        'div'
    );

    /**
     * pspell dictionary handle, or false when pspell is not available.
     * @var mixed
     */
    private $pspell;

    /**
     * @param string $doc url of the document or xhtml string
     * @param array $config configuration array. key debug bool default false, key encoding string default iso-8859-1, key doclang 2 char string default en
     */
    public function __construct($doc, $config = array())
    {
        if (!is_array($config)) {
            $config = array();
        }

        if (isset($config['debug'])) {
            $this->debug = $config['debug'];
        }

        // A document starting with "<scheme>://" is treated as a url (scheme
        // syntax per RFC 3986). NOTE(review): the url is handed to
        // file_get_contents() unfiltered, so wrappers such as file:// or
        // php:// are reachable; restrict the scheme if $doc can be untrusted.
        if (preg_match('/^[a-z][a-z0-9+.\-]*:\/\//i', $doc)) {
            $this->doctype = 'url';
            $this->docurl = $doc;
            $content = file_get_contents($doc);
            // a failed download yields an empty document instead of false
            $this->doctext = ($content === false) ? '' : $content;
        }
        else {
            $this->doctype = 'string';
            $this->doctext = $doc;
        }

        if (!empty($config['encoding'])) {
            $this->encoding = $config['encoding'];
        }
        else {
            $match = array();
            $found = preg_match("/<meta[^>]*http-equiv[^>]*charset=(.*)(\"|')\s*\/>/Ui", $this->doctext, $match);

            if ($found && !empty($match[1])) { // encoding found
                $this->encoding = $match[1];
            }
            else {
                $this->encoding = 'iso-8859-1';
            }
        }

        if (isset($config['doclang'])) {
            $this->doclang = $config['doclang'];
        }
        else {
            $this->doclang = 'en';
        }

        // the function name must be a string: a bare constant is an Error on PHP 8
        if (function_exists('pspell_new')) {
            $this->pspell = pspell_new($this->doclang, "", "", "", PSPELL_FAST);
        }
        else {
            $this->pspell = false;
        }

        $this->out_debug(array($this->doclang, $this->doctype, $this->encoding, $this->goodTags));
    }

    /**
     * Build the word list.
     * @return array words array (word => score) sorted by ascending score
     */
    public function buildStruct()
    {
        // analyze html's meanings
        $this->words = $this->analyze_xml();

        asort($this->words);

        $this->out_debug('RESULTS -------------------------');
        $this->out_debug($this->words);

        return $this->words;
    }

    /**
     * Split a text fragment into words and score the candidate keywords.
     *
     * A word gets weight_ucfirst when it starts uppercase and is neither the
     * first word of the text nor preceded by a non-space delimiter, and
     * weight_pspell when pspell is available and does not recognize it.
     * Bytes outside printable ASCII (32..126) are dropped, except for the
     * control characters listed in $words_delimiter, which end a word.
     *
     * @param string $text text fragment, tags are stripped before scanning
     * @return array lowercase word => score, only for words that scored
     */
    private function analyze_string($text)
    {
        $words = array();

        // analyze text without html and attach a final point to make the last word recognized
        $temptxt = strip_tags($text).'.';
        $length = strlen($temptxt);
        $current_word = '';
        $last_word = '';
        $last_word_index = '';

        $this->out_debug("Analyze string:".$temptxt.", strlen:".$length);

        for ($i = 0; $i < $length; $i++) {
            $char = $temptxt[$i];

            // Delimiters are tested first so that "\n", "\r" and "\t" (which
            // are below ASCII 32) really terminate a word.
            if (!in_array($char, $this->words_delimiter, true)) {
                $asciichar = ord($char);
                if ($asciichar >= 32 && $asciichar <= 126) {
                    $current_word .= $char;
                }
                continue;
            }

            if ($this->isReallyAString($current_word)) {
                $this->out_debug("\t$current_word is really a string!");

                $word_index = strtolower($current_word);

                $this->out_debug("\tFound word: $current_word, index: ".$word_index);

                // if first char is uppercase maybe is a candidate word...
                if (strlen($current_word) != $i && $current_word === ucfirst($current_word) && !$this->previousWordDelimited($temptxt, $current_word)) {
                    $this->out_debug("\tMaybe $current_word can have the uppercase score...");

                    if (!isset($words[$word_index])) {
                        $words[$word_index] = 0;
                    }
                    $words[$word_index] += self::weight_ucfirst;
                    $this->out_debug("\tFirst char uppercase, score:".$words[$word_index]);

                    // if first char uppercase && the previous char was a blank space && the previous word was uppercase too
                    // NOTE(review): $last_word is cleared after every
                    // delimiter (see the reset below), so this pairing branch
                    // never fires. Kept as in the original; confirm whether
                    // pairing was disabled on purpose before enabling it.
                    if ($temptxt[$i - strlen($current_word) - 1] == ' ' && strlen(trim($last_word)) > 0 && $last_word === ucfirst($last_word)) {
                        // then it is highly possible that the two words go together
                        $word_index = $last_word_index.' '.$word_index;

                        if (!isset($words[$word_index])) {
                            $words[$word_index] = 0;
                        }
                        $previous_score = isset($words[$last_word_index]) ? $words[$last_word_index] : 0;
                        $words[$word_index] += self::weight_ucfirst + $previous_score;
                        unset($words[$last_word_index]);
                        $this->out_debug("\tPrevious was uppercase.. get togheter: $word_index = $last_word_index + ".$current_word);
                    }
                }

                // if spell is not recognized maybe is a candidate word...
                if (strlen($current_word) > 1 && $this->pspell !== false && !pspell_check($this->pspell, $current_word)) {
                    if (!isset($words[$word_index])) {
                        $words[$word_index] = 0;
                    }
                    $words[$word_index] += self::weight_pspell;
                    $this->out_debug("\tWord not found in pspell, score:".$words[$word_index]);
                }

                $last_word_index = $word_index;
                $last_word = $current_word;
            }

            $last_word_index = '';
            $last_word = '';
            $current_word = '';
        }

        $this->out_debug('Analyze string ended:');
        $this->out_debug($words);

        return $words;
    }

    /**
     * Walk the document with XMLReader and score the text found inside the
     * good tags.
     *
     * Text inside a "really good" tag is collapsed into one phrase whose
     * score sums the tag weight and word score of every word; text in other
     * good tags keeps one entry per word. After a closing heading hN every
     * following score is multiplied by 1/N.
     *
     * @return array word or phrase => score, empty when there is no document
     */
    private function analyze_xml()
    {
        $this->out_debug('PARSE XML.............');

        $xmlwords = array();

        // XMLReader rejects empty input (ValueError on PHP 8)
        if (!is_string($this->doctext) || $this->doctext === '') {
            return $xmlwords;
        }

        $tags = array_keys($this->goodTags);
        $currentElement = '';
        $lastElement = '';
        $insideBad = false;
        $afterHeadingMoltiplicator = 1;

        $reader = new XMLReader();
        if (!$reader->XML($this->doctext, $this->encoding)) {
            return $xmlwords;
        }
        // NOTE(review): LOADDTD together with SUBST_ENTITIES lets the parser
        // load DTDs and expand entities; confirm this is acceptable when the
        // document is untrusted.
        $reader->setParserProperty(XMLReader::SUBST_ENTITIES, true);
        $reader->setParserProperty(XMLReader::LOADDTD, true);
        $reader->setParserProperty(XMLReader::VALIDATE, false);

        while ($reader->read()) {
            $el = strtolower($reader->name);
            $this->out_debug('xml node:'.$reader->nodeType.', '.$el.', currentElement: '.$currentElement.', last:'.$lastElement);

            switch ($reader->nodeType) {
                case XMLReader::ELEMENT:
                    $this->out_debug('start '.$el);
                    if (in_array($el, $tags, true)) {
                        $currentElement = $el;
                    }
                    else if (in_array($el, $this->badTags, true)) {
                        $insideBad = $el;
                    }
                    break;

                case XMLReader::END_ELEMENT:
                    $this->out_debug('end '.$el);
                    if ($insideBad === $el) {
                        $insideBad = false;
                    }
                    $lastElement = $el;
                    $currentElement = '';

                    // if heading element (h1..h6 only, so no division by zero on "h0")
                    $heading = array();
                    if (preg_match('/^h([1-6])$/', $el, $heading)) {
                        $afterHeadingMoltiplicator = 1 / (int) $heading[1];
                        $this->out_debug('After heading '.$el.' moltiplicator '.$afterHeadingMoltiplicator);
                    }
                    break;

                case XMLReader::TEXT:
                    if ((!$currentElement || $insideBad) && !in_array($lastElement, $tags, true)) {
                        break;
                    }

                    $isReallyGood = in_array($currentElement, $this->reallyGoodTags, true);
                    if ($isReallyGood) {
                        $this->out_debug('inside a really good tag:'.$currentElement);
                    }

                    $this->out_debug('analyze node content:'.$reader->value);
                    $words = $this->analyze_string($reader->value);

                    if (!count($words)) {
                        // "continue" inside a switch behaves like "break" and warns on PHP 7.3+
                        break;
                    }

                    // text following a good tag has no current element: weight 0
                    $tagWeight = isset($this->goodTags[$currentElement]) ? $this->goodTags[$currentElement] : 0;

                    if ($isReallyGood) {
                        // inside a really good tag we take the entire string as index
                        $score = 0;
                        $phrase = array();
                        foreach ($words as $word => $s) {
                            $this->out_debug('really good tag: word '.$word.', add '.$s.' to '.$score);
                            $score += $tagWeight + $s;
                            if (isset($xmlwords[$word])) {
                                $score += $xmlwords[$word];
                                unset($xmlwords[$word]);
                            }
                            $phrase[] = $word;
                        }

                        $words = array(implode(' ', $phrase) => round($score * $afterHeadingMoltiplicator, 2));
                    }
                    else {
                        foreach ($words as $word => $score) {
                            $this->out_debug('good tag: word '.$word.', add '.$tagWeight.' to '.$score);
                            $words[$word] = round(($tagWeight + $score) * $afterHeadingMoltiplicator, 2);
                        }
                    }

                    $xmlwords = array_merge($xmlwords, $words);
                    break;
            }
        }

        $reader->close();

        return $xmlwords;
    }

    /**
     * @return array the already computed word list (null before buildStruct())
     */
    public function get_words()
    {
        return $this->words;
    }

    /**
     * @return number the class version
     */
    public function get_version()
    {
        return $this->version;
    }

    /**
     * Tell whether a token is a word worth scoring: at least two characters
     * and either purely alphabetic or not a number / parsable date.
     *
     * @param string $word
     * @return boolean
     */
    private function isReallyAString($word)
    {
        if (strlen($word) < 2) {
            return false;
        }

        if (ctype_alpha($word)) {
            return true;
        }

        return is_numeric($word) === false && strtotime($word) === false && ctype_digit($word) === false;
    }

    /**
     * Check whether the first occurrence of $word in $text is preceded,
     * ignoring blank spaces, by a non-space delimiter or by the start of the
     * text (i.e. the word opens a sentence).
     *
     * @param string $text
     * @param string $word
     * @return boolean false also when $word does not occur in $text
     */
    private function previousWordDelimited($text, $word)
    {
        if (($w_pos = strpos($text, $word)) === false) {
            return false;
        }

        $this->out_debug("*** check if $word in $text was previous delimited");

        // step back over the blank spaces in front of the word
        $pos = $w_pos - 1;
        while ($pos >= 0 && $text[$pos] == ' ') {
            $pos--;
        }

        // nothing but spaces before the word: start of text counts as delimited
        if ($pos < 0 || in_array($text[$pos], $this->words_delimiter, true)) {
            $this->out_debug("*** it was");
            return true;
        }

        $this->out_debug("*** it was not");
        return false;
    }

    /**
     * Echo a numbered debug line (or one line per array entry) inside a green
     * pre block; does nothing unless debug is enabled.
     *
     * @param mixed $o string or array to print
     */
    private function out_debug($o)
    {
        if (!$this->debug) {
            return;
        }

        static $debug_c = 0;
        echo "\n<pre style=\"color:green\">";

        $debug_c++;
        if (is_array($o)) {
            foreach ($o as $k => $tok) {
                echo "\n{$debug_c}] $k => "; print_r($tok);
                $debug_c++;
            }
        }
        else {
            echo "\n{$debug_c}] $o";
        }
        echo "</pre>\n";
    }
}

/**
 * Ad-hoc smoke test: analyzes $article with debug output enabled and prints
 * the resulting word => score list as HTML.
 *
 * @param string $article url of the document or xhtml string
 * @param string $lang 2 char document language, passed as the doclang option
 */
function unit_test($article, $lang) {

    echo "<h1>Analizing for:</h1><div>".htmlspecialchars($article)."</div>";

    $mydoc = new seltz_analyzer($article, array('doclang' => $lang, 'debug' => true));
    $words = $mydoc->buildStruct();
    unset($mydoc);

    // here produce another analysis using Open Text Summarizer [see http://libots.sourceforge.net] output
    // NOTE(review): if this is ever enabled, $article and $lang must go
    // through escapeshellarg() before being passed to shell_exec().
    /*$summary = "<div>".strip_tags(shell_exec('echo "'.$article.'"|ots --ratio 10 -d '.$lang))."</div>";
    $mydoc = new seltz_analyzer($summary, array('doclang' => $lang, 'debug' => true));
    $sumwords = $mydoc->buildStruct();

    // and then sum the two arrays' scores
    foreach ($words as $word => $score)
        $words[$word] = $sumwords[$word] + $score;

    foreach ($sumwords as $word => $score)
        if (!isset($words[$word]))
            $words[$word] = $sumwords[$word];
            
    asort($words);
    */

    // the words come from the analyzed document: escape them for the HTML context
    echo "<pre>".htmlspecialchars(print_r($words, true))."</pre>";
}

//unit_test('http://www.w3.org/WAI/', 'en');

?>
