﻿using System;
using System.Web;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Linq;
 
/// <summary>
/// One row of the crawl table: a page URL together with its title, the number of
/// search-word hits counted on the page, and a flag telling whether it has been visited.
/// </summary>
/// <remarks>
/// Equality and hashing depend on <see cref="URL"/> only, so a HashSet of rows holds at
/// most one row per URL. Because URL is the hash key, do not change it while the row is
/// stored in a hash-based collection. A null URL is allowed: it hashes to 0 and equals
/// only another null URL.
/// </remarks>
public class RowSchema
{
    public string URL { get; set; }
    public bool Visited { get; set; }
    public string Title { get; set; }
    public int Count { get; set; }

    /// <summary>Creates an empty row; every property keeps its default value.</summary>
    public RowSchema() { }

    /// <summary>Creates an unvisited row for the given URL and title with a zero hit count.</summary>
    /// <param name="URL">Address of the page; acts as the identity of the row.</param>
    /// <param name="Title">Title of the page.</param>
    public RowSchema(string URL, string Title)
    {
        this.URL = URL;
        this.Title = Title;
        this.Count = 0;
    }

    // Note: we need to provide overrides for the "key" of the HashSet

    /// <summary>Hash of the URL, or 0 when no URL has been assigned yet.</summary>
    public override int GetHashCode()
    {
        // A row built with the parameterless constructor has a null URL; it must
        // still be hashable instead of throwing.
        return URL == null ? 0 : URL.GetHashCode();
    }

    /// <summary>Two rows are equal when their URLs are equal (ordinal comparison).</summary>
    public override bool Equals(object obj)
    {
        return Equals(obj as RowSchema);
    }

    bool Equals(RowSchema x)
    {
        // The static string.Equals copes with a null URL on either side.
        return (x != null) && string.Equals(x.URL, this.URL);
    }
}


/// <summary>
/// Time-boxed crawler. Search() alternates between fetching a Bing result page for the
/// search words and fetching an unvisited URL from the caller's table, and feeds the
/// links found on each page back into that table.
/// </summary>
public class Grabber
{   public  HttpSessionStateBase  Session;  // Not set by the constructor; FetchURL keeps its CookieContainer in it under the "cc" key
    // Creates a grabber with the default settings declared below.
    public Grabber()
    {  }
     

    public int MaxServerTime = 10; // Time allotted, in seconds, to one Search() call (server-side processing)

    // Comma-separated list of domains whose URLs GrabURLs leaves out of its results.
    public string blockedDomains = "google.com,microsoft.com,webstatsdomain.com,eyoonk.com,uz-translations.net,filestube.com,babylon.com"; 

    // Random source for Search(): picks the Bing "first" offset and decides whether a
    // pass fetches a table row instead of a search page.
    Random rand = new Random();
       
    StreamWriter logFile = null; // For debugging only; logging is off while this stays null
      
      
   /// <summary>
   /// Crawls for pages related to <paramref name="searchWords"/> until the
   /// <see cref="MaxServerTime"/> budget (in seconds) is used up. Each pass fetches either
   /// a randomly offset Bing result page or the first unvisited row of
   /// <paramref name="urlTable"/>; a fetched row is scored by title and word count, and the
   /// links found on the page are added to the table as new unvisited rows.
   /// </summary>
   /// <param name="searchWords">Search words, separated by spaces, '+' or ','.</param>
   /// <param name="MaxRecords">Upper bound on the number of rows kept in <paramref name="urlTable"/>.</param>
   /// <param name="urlTable">
   /// Crawl table, updated in place: rows are added, marked visited, given a title and a
   /// count, or removed when their page cannot be loaded or does not match the search words.
   /// </param>
   /// <returns>
   /// true when the table is full and every row has been visited (the caller can stop its
   /// refresh timer); false when the time budget ran out first.
   /// </returns>
   public bool Search(string searchWords, int MaxRecords, HashSet<RowSchema> urlTable)
   {
       // Uncomment next line for logging
       // logFile = new StreamWriter(Server.MapPath("") + @"\log.txt");

       // UTC, so that a daylight-saving switch cannot stretch or cut the time budget.
       DateTime started = DateTime.UtcNow;

       try
       {
           while ((DateTime.UtcNow - started).TotalSeconds <= MaxServerTime)
           {
               string SearchUrl = String.Format("http://www.bing.com/search?q={0}", HttpUtility.UrlEncode(searchWords)) + "&first=" + rand.Next(500);
               string parentURL = "";
               RowSchema row1 = null;

               // Once a few rows exist, follow a stored URL on roughly every other pass.
               if ((urlTable.Count > 5) && (rand.NextDouble() < 0.5))
               {
                   row1 = urlTable.FirstOrDefault(p => !p.Visited);

                   // ">=" rather than "==": a table that already exceeds MaxRecords
                   // must also be reported as finished.
                   if ((row1 == null) && (urlTable.Count >= MaxRecords))
                       return true; // All visited; use to disable refresh timer

                   if (row1 != null)
                   {
                       SearchUrl = row1.URL;
                       row1.Visited = true;   // Optimistic that call to FetchURL() will be OK
                       parentURL = SearchUrl;
                   }
               }

               string searchData = FetchURL(SearchUrl);

               // FetchURL reports failures as text starting with "Error"; a page
               // without a <body> tag is treated the same way.
               int bodyStart = -1;
               if (!searchData.StartsWith("Error", StringComparison.Ordinal))
                   bodyStart = searchData.IndexOf("<body", StringComparison.OrdinalIgnoreCase);

               if (bodyStart == -1)
               {
                   if (row1 != null) urlTable.Remove(row1);
                   continue;
               }

               // Everything before "<body" is the head. Taking exactly bodyStart
               // characters also works when the tag is at index 0 (empty head).
               string htmlHead = searchData.Substring(0, bodyStart);
               // Invariant lower-casing: the link scanner looks for the literal
               // "href=", which culture-specific casing rules could break.
               string htmlBody = searchData.Substring(bodyStart).ToLowerInvariant();

               if (row1 != null)
               {
                   string Title = GetTitle(htmlHead, searchWords);
                   if (Title == "")
                   {
                       urlTable.Remove(row1);
                       continue;
                   }

                   int Count = CountWords(htmlBody, searchWords);
                   if (Count == 0)
                   {
                       urlTable.Remove(row1);
                       continue;
                   }

                   row1.Title = Title;
                   row1.Count = Count;
               }

               HashSet<string> urlSet = GrabURLs(htmlBody, parentURL);

               foreach (string s in urlSet)
               {
                   if (urlTable.Count >= MaxRecords) break;

                   RowSchema fresh = new RowSchema();
                   fresh.URL = s;
                   fresh.Visited = false;

                   // Note: HashSet collection guarantees uniqueness (no duplicate)
                   // based on the override for Equals(); the row is not added if
                   // the table already holds one with the same URL.
                   urlTable.Add(fresh);
               }
           }

           return false;
       }
       finally
       {
           // Close the debug log on every exit path and drop the reference so a
           // later call cannot write to a closed writer.
           if (logFile != null)
           {
               logFile.Close();
               logFile = null;
           }
       }
   }
      

      /// <summary>
      /// Collects up to ten absolute, lower-cased "http://" URLs from the href attributes
      /// in <paramref name="htmlData"/>, resolving relative links against
      /// <paramref name="parentURL"/> and dropping links that match the built-in filters
      /// (bookmarks, archives, blocked domains, near duplicates, too short or too long).
      /// </summary>
      /// <param name="htmlData">
      /// Markup to scan. The literal "href=" is matched case-sensitively, so callers pass
      /// lower-cased markup.
      /// </param>
      /// <param name="parentURL">
      /// URL of the page the markup came from, or "" for a search-result page; in the
      /// latter case only links that start with "http:" are considered.
      /// </param>
      /// <returns>The accepted URLs (at most 10); empty when nothing qualifies.</returns>
      HashSet<string> GrabURLs(string htmlData, string parentURL)
      {
          DateTime t1 = DateTime.Now;

          if (logFile != null) logText("Entered:" + t1);

          HashSet<string> urlSet = new HashSet<string>();  // Our set for URLs

          if (htmlData == null) htmlData = "";
          if (parentURL == null) parentURL = "";

          // Blocked domains are compared as whole names. A substring search in the
          // comma-separated list would also reject unrelated domains such as
          // "le.com", which happens to occur inside "google.com".
          HashSet<string> blocked = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
          foreach (string entry in (blockedDomains ?? "").Split(','))
          {
              string name = entry.Trim();
              if (name.Length > 0) blocked.Add(name);
          }

          int pos = 0; // Track "From" position for searching htmlData
          while (true)
          {
              int i = htmlData.IndexOf("href=", pos, StringComparison.Ordinal);
              if (i < 0) break;

              i = i + 5;   // Skip over href=
              if (i >= htmlData.Length) break;   // "href=" is the last thing in the markup

              bool quoted = htmlData[i] == '"' || htmlData[i] == '\'';
              if (quoted) i++;

              // The value ends at " or ' or >; an unquoted value also ends at white
              // space. Stop at the end of the markup when the attribute is never closed.
              int j = i;
              while (j < htmlData.Length)
              {
                  char c = htmlData[j];
                  if (c == '\'' || c == '"' || c == '>') break;
                  if (!quoted && char.IsWhiteSpace(c)) break;
                  j++;
              }

              pos = j; // Move forward on searching htmlData

              // Trim before the length check so the fixed-position reads below
              // (Substring(0, 5), IndexOf('/', 8)) can rely on at least 8 characters.
              string URL = htmlData.Substring(i, j - i).Trim().ToLowerInvariant();

              if (URL.Length < 8) continue;
              if (URL.IndexOf('#') != -1) continue;  // Skip bookmarks

              if (URL.StartsWith("://", StringComparison.Ordinal)) URL = "http" + URL;

              string part1 = URL.Substring(0, 5);
              if (parentURL == "")
              {
                  if (part1 != "http:") continue;
              }

              // A second "http" inside the value marks a redirect or tracking link.
              if (URL.IndexOf("http", 1, StringComparison.Ordinal) != -1) continue;

              if (URL.IndexOf("torrent", StringComparison.Ordinal) != -1) continue;
              if (URL.EndsWith(".pdf", StringComparison.Ordinal)) continue;
              if (URL.EndsWith(".zip", StringComparison.Ordinal)) continue;
              if (URL.EndsWith(".rar", StringComparison.Ordinal)) continue;
              if (URL.IndexOf("showthread", StringComparison.Ordinal) != -1) continue;
              if (URL.IndexOf("javascript", StringComparison.Ordinal) != -1) continue;

              if (URL.StartsWith("http://", StringComparison.Ordinal))
              {
                  // Host-only URL: give it a trailing slash.
                  if (URL.IndexOf('/', 8) == -1) URL = URL + '/';
              }
              else
              {
                  // Convert relative URL to absolute
                  if (parentURL != "")
                  {
                      int hostStart = parentURL.IndexOf("//", StringComparison.Ordinal) + 2;
                      int hostEnd = parentURL.IndexOf('/', hostStart);
                      string pathtohost, pathlessfile;
                      if (hostEnd != -1)
                      {
                          pathtohost = parentURL.Substring(0, hostEnd);
                          pathlessfile = parentURL.Substring(0, parentURL.LastIndexOf('/') + 1);  // include last slash
                      }
                      else
                      {
                          pathtohost = parentURL;
                          pathlessfile = parentURL + "/";
                      }

                      if (URL.StartsWith("/", StringComparison.Ordinal)) URL = pathtohost + URL;
                      else URL = pathlessfile + URL;
                  }

                  // Handle .. in URL: drop the path segment in front of it
                  int dots = URL.IndexOf("..", StringComparison.Ordinal);
                  if (dots != -1)
                  {
                      int cut = dots - 1;
                      if (cut >= 0 && URL[cut] == '/') cut--;
                      if (URL.LastIndexOf('/') > 10)
                      {
                          while (cut >= 0 && URL[cut] != '/') cut--;
                      }

                      if (cut < 0) continue;   // nothing in front of ".." to remove
                      URL = URL.Substring(0, cut) + URL.Substring(dots + 2);
                  }
              }

              // Parsing above may still get bad URLs that should be ignored.
              // The length limits come first so the fixed-offset search below is
              // always inside the string.
              if (URL.Length > 100) continue;
              if (URL.Length < 15) continue;

              if (URL.IndexOf("..", StringComparison.Ordinal) > 0) continue;
              if (URL.IndexOf("//", 9, StringComparison.Ordinal) != -1) continue;

              if (parentURL != "")
              {
                  // Links back to the parent page itself
                  if (parentURL + "index.php" == URL) continue;
                  if (parentURL + "index.htm" == URL) continue;
                  if (parentURL + "index.html" == URL) continue;
                  if (parentURL + "/" == URL) continue;
              }

              // Extract Domain and filter
              int schemeEnd = URL.IndexOf("//", StringComparison.Ordinal);
              if (schemeEnd == -1) continue;   // no host part at all
              int domainStart = schemeEnd + 2;
              int domainEnd = URL.IndexOf('/', domainStart);
              if (domainEnd == -1) domainEnd = URL.Length;
              string domain = URL.Substring(domainStart, domainEnd - domainStart);

              // With three or more labels, drop the first one ("www.site.com" -> "site.com").
              int firstDot = domain.IndexOf('.');
              if (domain.IndexOf('.', firstDot + 1) != -1) domain = domain.Substring(firstDot + 1);

              if (domain.Length == 0 || blocked.Contains(domain)) continue;

              // Skip links that carry exactly the same query string as the parent
              int query = URL.IndexOf('?');
              if (query != -1)
              {
                  int parentQuery = parentURL.IndexOf('?');
                  if (parentQuery != -1 && URL.Substring(query) == parentURL.Substring(parentQuery)) continue;
              }

              // CheckRepeated returns true when URL is NOT a near duplicate
              if (CheckRepeated(URL, urlSet)) urlSet.Add(URL);
              if (urlSet.Count >= 10) break;
          }

          if (logFile != null) logText("Exited-duration (ms):" + (DateTime.Now - t1).TotalMilliseconds);

          return urlSet;
      }
     

        /// <summary>
        /// Tells whether <paramref name="URL"/> is new with respect to
        /// <paramref name="urlSet"/>. Despite the name, the result is true when the URL is
        /// NOT a repeat.
        /// </summary>
        /// <param name="URL">Candidate URL.</param>
        /// <param name="urlSet">URLs accepted so far.</param>
        /// <returns>
        /// false when the set holds an entry whose length differs from the candidate's by
        /// at most one character and one of the two is a prefix of the other; true otherwise.
        /// </returns>
        Boolean CheckRepeated(string URL, HashSet<string> urlSet)
        {
            bool nearDuplicate = urlSet.Any(known =>
                Math.Abs(known.Length - URL.Length) <= 1
                && (URL.StartsWith(known) || known.StartsWith(URL)));

            return !nearDuplicate;
        }

        /// <summary>
        /// Reads the text between &lt;title&gt; and &lt;/title&gt; in
        /// <paramref name="htmlHead"/> and accepts it only when it mentions at least one
        /// of the search words.
        /// </summary>
        /// <param name="htmlHead">Head section of the page.</param>
        /// <param name="searchWords">Search words separated by ' ', '+', ',' or '.'.</param>
        /// <returns>
        /// The title, cut to 150 characters, when it contains (ignoring case) a search word
        /// longer than one character; "" when there is no complete title element or no
        /// word matches.
        /// </returns>
        string GetTitle(string htmlHead, string searchWords)
        {
            const string openTag = "<title>";
            const string closeTag = "</title>";

            int start = htmlHead.IndexOf(openTag, StringComparison.InvariantCultureIgnoreCase);
            if (start == -1) return "";
            start += openTag.Length;

            int end = htmlHead.IndexOf(closeTag, start, StringComparison.InvariantCultureIgnoreCase);
            if (end == -1) return "";

            string Title = htmlHead.Substring(start, end - start);
            if (Title.Length > 150)
            {
                Title = Title.Substring(0, 150);
            }

            // '.' is a separator too, so dots in the search words are ignored.
            char[] separators = new[] { ' ', '+', ',', '.' };
            foreach (string word in searchWords.Split(separators))
            {
                // One-character (and empty) words are too weak to count as a match.
                if (word.Length <= 1) continue;

                if (Title.IndexOf(word, StringComparison.InvariantCultureIgnoreCase) != -1)
                {
                    return Title;
                }
            }

            return "";
        }
      

    /// <summary>
    /// Counts how often the search words occur in <paramref name="htmlData"/>, ignoring
    /// case. Occurrences of one word do not overlap; the counts of all words are added up.
    /// </summary>
    /// <param name="htmlData">Text to search.</param>
    /// <param name="searchWords">Search words separated by ' ', '+' or ','.</param>
    /// <returns>Total number of occurrences; 0 when either argument is null or empty.</returns>
    int CountWords(string htmlData, string searchWords)
    {
        if (string.IsNullOrEmpty(htmlData) || string.IsNullOrEmpty(searchWords)) return 0;

        char[] delimiters = new char[] { ' ', '+', ',' };

        // Drop empty entries: consecutive delimiters ("foo, bar") would otherwise yield
        // an empty word, which IndexOf finds at every position without ever advancing,
        // so the scan below would never end.
        string[] words = searchWords.Split(delimiters, StringSplitOptions.RemoveEmptyEntries);

        int count = 0;
        foreach (string word in words)
        {
            int pos = 0;
            while (true)
            {
                // Ordinal matching guarantees that a hit spans exactly word.Length
                // characters, so the next start position never passes the end.
                int hit = htmlData.IndexOf(word, pos, StringComparison.OrdinalIgnoreCase);
                if (hit == -1) break;

                count++;
                pos = hit + word.Length;
            }
        }

        return count;
    }
     

    /// <summary>
    /// Writes one line to the debug log and flushes it right away; does nothing while
    /// no log file is open.
    /// </summary>
    /// <param name="txt">Line to write.</param>
    void logText(string txt)
    {
        if (logFile == null)
        {
            return;
        }

        logFile.WriteLine(txt);
        logFile.Flush();
    }

     
    /// <summary>
    /// Fetches the HTML for a given url with the .NET WebRequest class, using a 5 second
    /// timeout and a cookie container that is kept in <see cref="Session"/> under "cc".
    /// </summary>
    /// <param name="url">Address to download.</param>
    /// <returns>
    /// The response body on success. Every failure (request error, status other than
    /// 200, empty body, content type other than text/htm*) is reported as a string that
    /// starts with "Error"; callers detect failures by that prefix.
    /// </returns>
    string FetchURL(string url)
    {
        try
        {
            // Create a request for the URL
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

            // Reuse the cookies of earlier requests when a session is available;
            // without one, fall back to a container that lives for this call only.
            CookieContainer cc = null;
            if (Session != null) cc = (CookieContainer)Session["cc"];

            if (cc == null)
            {
                cc = new CookieContainer();
                if (Session != null) Session["cc"] = cc;
            }

            request.CookieContainer = cc;
            request.Timeout = 5000; // 5 seconds

            request.KeepAlive = false;
            request.ProtocolVersion = HttpVersion.Version10;
            request.AllowAutoRedirect = true;
            request.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1)";

            // "using" releases the connection on every path, including the early
            // "Error" returns below.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                int StatusCode = (int)response.StatusCode;

                if (StatusCode != 200) return "Error:" + response.StatusDescription;
                if (response.ContentLength == 0) return "Error";

                // Media type names are case-insensitive.
                string contentType = response.ContentType;
                if (contentType == null || !contentType.StartsWith("text/htm", StringComparison.OrdinalIgnoreCase))
                    return "Error";

                using (Stream dataStream = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(dataStream))
                {
                    return reader.ReadToEnd();
                }
            }
        }
        catch (Exception ex)
        {
            // Deliberately best effort: any failure becomes an "Error" result so the
            // crawl loop can drop the URL and move on.
            return ("Error: " + ex.Message);
        }
    }
   
}
