<?
 // Copyright (c) 2001 Sergey Frolovithev (siteindexer@idesign.ru). All rights reserved.
 // class for indexing web pages
 // created by: Sergey Frolovithev
 // created: 26.02.2001

 // Recursive page indexer.  Constructing a cindexer fetches one page, records
 // its title, description and words through the global $DB object and then
 // constructs a further cindexer for every in-site link found on that page.
 //
 // Collaborators that are NOT defined in this file: the chttp and ctags
 // classes, the global $DB object, and the $CONF* / query globals named below.
 // NOTE(review): recursion depth is not limited; the only stop condition visible
 // here is the page insert reporting an error for an already stored URL.
 class cindexer
  { var $log;                             // logger object, used through write($message,$type)
    var $host;                            // normalised URL of the page handled by this instance
    var $sw=array("and","or","no","on");  // stop words that are never indexed
    var $mwl=2;                           // a word is indexed only when strlen(word) > $mwl
    var $needlog=false;                   // when false, log() does nothing

    // Passes $message and $type to the logger when logging is switched on.
    // A missing logger object is tolerated instead of aborting the crawl.
    function log($message,$type)
     { if ($this->needlog && is_object($this->log))
        $this->log->write($message,$type);
     }

    // Indexes the page at $host and recurses into its in-site links.
    //   $host - absolute URL of the page to index
    //   $log  - logger object, handed on to every nested cindexer
    // Progress is echoed to the output; failures are reported through log().
    function __construct($host,$log)
     { global $DB; // database class
       global $CONFindexerlog; // need log?
       global $insPageQuery,$updPageInfoQuery; // query
       global $CONFuntitle,$CONFspecialbegin,$CONFspecialend; // default title and special comment for skipping header and footer
       global $CONFmin_word_lenght,$CONFstop_words; // word length limit, stop words
       global $badurls; // ignoreurls

       $host=$this->_normalizeUrl($host);

       $this->host=$host;
       $this->log=$log;
       // the class defaults stay in force when the configuration globals are not set
       if (isset($CONFmin_word_lenght))
        $this->mwl=$CONFmin_word_lenght;
       if (is_array($CONFstop_words))
        $this->sw=$CONFstop_words;
       $this->needlog=$CONFindexerlog;

       // black list check comes first, so no connection is opened (and left
       // open) for a rejected URL; the rejection is logged instead of dropped
       if (is_array($badurls))
        { foreach ($badurls as $bad)
           { if ((string)$bad!="" && strpos($host,(string)$bad)!==false)
              { $this->log("Host: $host is in bad urls list!",1);
                return;
              }
           }
        }

       // url parse; components that are missing are treated as empty
       $p=parse_url($host);
       if (!is_array($p))
        $p=array();
       $hostname=isset($p["host"]) ? $p["host"] : "";
       $path=(isset($p["path"]) && $p["path"]!="") ? $p["path"] : "/";
       if (isset($p["query"]) && $p["query"]!="")
        $path.="?".$p["query"];

       // http class init
       $h=new chttp;
       $error=$h->Open(array("HostName"=>$hostname));
       if ($error!="")
        { $h->Close();
          $this->log("[ERROR] Can't connect to \"".$hostname."\"!",1);
          return;
        }

       $error=$h->SendRequest(array("RequestURI"=>$path,
                                    "Headers"=>array("Host"=>$hostname,
                                    "User-Agent"=>"Mozilla/4.0 (compatible; MSIE 5.0; Windows 98; DigExt)",
                                    "Pragma"=>"no-cache")));
       if ($error!="")
        { $h->Close();
          $this->log("[ERROR] Can't get \"".$hostname.$path."\"!",1);
          return;
        }

       // check if already indexed page: an error from the insert ends the work
       $error=$DB->insMQuery($insPageQuery,array("page"=>$host));
       if ($error!="")
        { $h->Close();
          return;
        }
       $page_id=$DB->insert_id;
       echo "[".date("H:i:s d-m-Y")."] ".$host."\n";

       // getting page header status
       // NOTE(review): no call-time "&" here (a parse error since PHP 5.4);
       // chttp is assumed to declare these parameters by reference - confirm.
       $headers=array();
       $h->ReadReplyHeaders($headers);
       if (is_array($headers))
        { $first=true;
          foreach ($headers as $k=>$v)
           { $k=(string)$k;
             // only the first header entry is inspected for a 404 status
             if ($first && strpos($k,"404")!==false)
              { $h->Close();
                $DB->updMQuery($updPageInfoQuery,array("id"=>$page_id,"title"=>"404 not found","descr"=>"","status"=>"404"));
                return;
              }
             if ($k==="location")
              { // redirect: release this connection before following it
                $h->Close();
                $this->_crawlIfLocal($v);
                return;
              }
             $first=false;
           }
        }

       // reading body in chunks until an error or an empty chunk
       $fbody="";
       for(;;)
        { $body="";
          $error=$h->ReadReplyBody($body,1000);
          if ($error!="" || strlen((string)$body)==0)
           break;
          $fbody.=$body;
        }
       // connection close
       $h->Close();

       $urls=$this->parseHtml($fbody);

       // title (non-greedy, so a later "</title>" in the page is not swallowed)
       $obody=str_replace("\n"," ",$fbody);
       if (preg_match("/<title>(.*?)<\/title>/i",$obody,$m))
        $title=$m[1];
       else
        $title=$CONFuntitle;

       $tags=new ctags();

       // descr: when the begin marker is present only the text after it (and
       // before the end marker, if that is present too) is described and indexed.
       // Separate match arrays are used on purpose: a failed preg_match empties
       // its output array, which used to wipe the text when the end marker was missing.
       // NOTE(review): the markers are inserted into the pattern unescaped, so
       // they act as regular expressions and must not contain "/" - confirm intent.
       $content=$obody;
       if (preg_match("/".$CONFspecialbegin."(.*)/i",$obody,$m))
        { $content=$m[1];
          if (preg_match("/(.*)".$CONFspecialend."/i",$content,$m2))
           $content=$m2[1];
          $fbody=$content;
        }
       $descr=substr($tags->ctags($content),0,252)."...";

       $this->log($page_id."|".$host."|".$title,0);
       $DB->updMQuery($updPageInfoQuery,array("id"=>$page_id,"title"=>$title,"descr"=>$descr,"status"=>200));

       $this->_indexWords($tags->ctags($fbody),$page_id);
       // the page text is no longer needed; free it before recursing
       unset($fbody,$obody,$content);

       // href parsing and recursive call
       if (is_array($urls))
        { foreach ($urls as $link)
           { $href=$link["HREF"];
             if (strpos($href,"mailto:")!==false || strpos($href,"#")!==false)
              continue;
             $href=str_replace(array("'","\""),"",$href);
             $this->_crawlIfLocal($href);
           }
        }
     }

    // Class-named constructor of the original code.  Kept as an alias so that
    // explicit calls by name, and PHP versions that only know class-named
    // constructors, keep working.
    function cindexer($host,$log)
     { $this->__construct($host,$log);
     }

    // Collapses repeated slashes in $url while leaving every "http://" intact.
    function _normalizeUrl($url)
     { $url=str_replace("http://","http:::",$url);
       while (strpos($url,"//")!==false)
        $url=str_replace("//","/",$url);
       return str_replace("http:::","http://",$url);
     }

    // Indexes $url with a nested cindexer when it belongs to the configured site.
    // A URL without a host part is prefixed with $CONFhost first.
    // NOTE(review): relative links are therefore resolved against the site root,
    // not against the directory of the current page (behaviour kept as it was).
    function _crawlIfLocal($url)
     { global $CONFhost; // start host

       $parts=parse_url($url);
       if (!is_array($parts) || !isset($parts["host"]) || $parts["host"]=="")
        $url=$CONFhost."/".$url;
       // an empty $CONFhost must not match every URL
       if ((string)$CONFhost!="" && strpos($url,(string)$CONFhost)!==false)
        $child=new cindexer($url,$this->log);
     }

    // Stores every indexable word of $text and links it to page $page_id.
    // A word is indexable when it is longer than $this->mwl and is not a stop word.
    function _indexWords($text,$page_id)
     { global $DB; // database class
       global $selWordQuery,$insWordQuery,$insWordToPageQuery,$updWordToPageQuery; // query

       $text=str_replace(array("&nbsp;","&copy;","&reg;","\t","\n"),
                         array(" ","",""," "," "),$text);
       $words=explode(" ",chop($text));

       foreach ($words as $word)
        { $word=trim($word);
          if ($word=="" || strlen($word)<=$this->mwl || in_array($word,$this->sw))
           continue;

          $error=$DB->insMQuery($insWordQuery,array("word"=>$word));
          if ($error!="")
           { // insert reported an error: fall back to looking the word up
             $row=$DB->selMRow($selWordQuery,array("word"=>$word));
             if (!is_array($row) || !isset($row["id"]))
              continue; // no id available, nothing to link
             $word_id=$row["id"];
           }
          else
           $word_id=$DB->insert_id;

          // link word to page; when the insert reports an error run the update query
          $error=$DB->insMQuery($insWordToPageQuery,array("word_id"=>$word_id,"page_id"=>$page_id));
          if ($error!="")
           $DB->updMQuery($updWordToPageQuery,array("word_id"=>$word_id,"page_id"=>$page_id));
        }
     }

    // Extracts the href values of all <a ...> and <area ...> tags in $str.
    // Returns a list of array("HREF"=>value) entries in document order (an empty
    // array when there are none).  A value is the raw text after "href=" up to
    // the first whitespace or ">", so surrounding quotes are still included.
    function parseHtml ($str)
     { $urls=array();
       // the look-behind keeps attributes such as data-href from being taken for href
       if (preg_match_all('/<a(?:rea)?\s[^>]*?(?<![\w-])href\s*=\s*([^\s>]+)/i',(string)$str,$found))
        { foreach ($found[1] as $href)
           $urls[]=array("HREF"=>$href);
        }
       return $urls;
     }


    // Experimental tag parser.  Returns
    //   array(TAGNAME => array(n => array(OPTION => value, ...)))
    // for every tag that carries options; tag and option names are upper-cased.
    // The tag text is split on single spaces and option reading stops at the
    // first empty (or "0") piece, so quoted values containing spaces are not
    // kept together.  An unterminated "<" ends the scan.
    function parseHtmlUnderConstruction($s_str)
     { $a_html = array();
       $i_indicatorR = 0;

       // Search for a tag in string
       while( is_int(($i_indicatorL=strpos($s_str,"<",$i_indicatorR))) )
        { // Get everything into tag...
          $i_indicatorL++;
          $i_indicatorR = strpos($s_str,">", $i_indicatorL);
          if (!is_int($i_indicatorR))
           break; // no closing ">": stop instead of rescanning from the start forever

          $a_tag = explode( ' ', substr($s_str, $i_indicatorL, ($i_indicatorR-$i_indicatorL) ) );

          // Here we get the tag's name
          $s_tagName = strtoupper($a_tag[0]);

          // Without this, we will mess up the array
          $i_arrayCounter = isset($a_html[$s_tagName]) ? count($a_html[$s_tagName]) : 0;

          // get the tag options, like src="http://". Here, s_tagTokOption is 'src' and s_tagTokValue is '"http://"'
          for ($i=1; $i<count($a_tag) && $a_tag[$i]; $i++)
           { $s_tagTokOption = strtoupper((string)strtok($a_tag[$i], "="));
             $s_tagTokValue  = trim((string)strtok("="));
             $a_html[$s_tagName][$i_arrayCounter][$s_tagTokOption] = $s_tagTokValue;
           }
        }
       return $a_html;
     }

  }
?>