<?php

# index.php
#######################################
# Word Harvester  2010 Scott Connell
# Created: 2010/12/26 (year/month/day)
# Updated: 2012/10/04
# License: Free for personal and commercial use.
# Terms: Redistribution/republishing strictly forbidden.
# Source: http://www.scottconnell.com
#######################################

# SET VARIABLES ##########################
# Set the full path to this directory with trailing slash /.

# Directory that holds this script; the value must end with a slash.
$full_path = 'c:/scott/source/word_harvester/';

# END SETTING VARIABLES ###################

# Header and footer templates are both resolved inside $full_path.
$footer_path = "{$full_path}footer.php";
$header_path = "{$full_path}header.php";

# Prints the URL entry form.
#
# Reads the global $domain to pre-fill the text field. When $domain is unset
# (or is not a string, e.g. the request sent www[]=x) it is reset to the
# placeholder "http://". The form posts back to the current script.
#
# The pre-fill value is HTML-escaped before output because $domain carries
# whatever the visitor submitted; printing it raw inside value="..." would let
# a crafted URL break out of the attribute and inject markup.
function printForm()
{
global $domain;

	if(!isset($domain) || !is_string($domain))
	{
	$domain = "http://";
	}

# Escape for an attribute context (both quote styles); the global itself is
# left untouched so the rest of the script still sees the raw URL.
$value = htmlspecialchars($domain, ENT_QUOTES, 'UTF-8');
$action = htmlspecialchars($_SERVER["PHP_SELF"], ENT_QUOTES, 'UTF-8');

print <<<ENDHTM
<form method="post" action="$action">
<p>URL <input type="text" name="www" size="30" value="$value" /> <input type="submit" value="Submit" /></p>
</form>

ENDHTM;
}

# Sort callback that orders values from largest to smallest.
#
# Returns 0 when the two values compare loosely equal, 1 when $a is the
# smaller one (so it is placed after $b), and -1 otherwise.
function cmp($a, $b)
{
	if ($a != $b)
	{
		if ($a < $b)
		{
			return 1;
		}

		return -1;
	}

	return 0;
}

# Request handling: when a URL was submitted (field "www"), fetch that page
# and print a table of its words ordered by how often each occurs; otherwise
# print only the form. Every path ends by including the footer.
if(isset($_REQUEST['www']))
{
	# A non-string value (e.g. www[]=x) cannot be a URL; treat it as empty so
	# it fails the parse check below instead of raising a type error.
	$domain = is_string($_REQUEST['www']) ? $_REQUEST['www'] : "";

	$xScheme = strtolower((string) parse_url($domain, PHP_URL_SCHEME));
	$xHost = parse_url($domain, PHP_URL_HOST);

	# Only http(s) URLs are accepted: file_get_contents() below would
	# otherwise open any stream wrapper (file://, ftp://, ...) on request.
	if(!$xHost || ($xScheme != "http" && $xScheme != "https"))
	{
		$title = "Error: Could not parse URL";
		include_once($header_path);
		printForm();
		include_once($footer_path);
		exit;
	}

	# The host is visitor input that ends up in $title. header.php is not
	# visible from here, so escape defensively; for any valid host name this
	# is a no-op.
	$safeHost = htmlspecialchars($xHost, ENT_QUOTES, 'UTF-8');

	# Probe the port the URL actually targets rather than always port 80.
	$xPort = parse_url($domain, PHP_URL_PORT);
	if(!$xPort)
	{
		$xPort = ($xScheme == "https") ? 443 : 80;
	}

	if(!$fp = @fsockopen($xHost, $xPort, $errno, $errstr, 30))
	{
		$title = "Error: Could not connect to $safeHost";
		include_once($header_path);
		printForm();
		include_once($footer_path);
		exit;
	}

	# The socket was only a reachability probe; release it straight away.
	fclose($fp);

	# Fetch before any output so a failed download gets an error title too.
	$html = @file_get_contents($domain);

	if($html === false)
	{
		$title = "Error: Could not retrieve page from $safeHost";
		include_once($header_path);
		printForm();
		include_once($footer_path);
		exit;
	}

	$title = "Word Harvester";
	include_once($header_path);
	printForm();

	# Drop <script> and <style> elements together with their contents. The
	# match is lazy so each element is removed on its own and the markup
	# between two such elements survives. A null result (PCRE error) keeps
	# the previous text.
	$text = $html;

	foreach(array("script", "style") as $tag)
	{
		$stripped = preg_replace("|<{$tag}[^>]*?>.*?</{$tag}>|si", " ", $text);

		if($stripped !== null)
		{
			$text = $stripped;
		}
	}

	# Pad every tag end with a space so words in adjacent elements do not
	# fuse once the tags are removed.
	$text = str_replace(">", "> ", $text);

	# Strip tags first and decode entities afterwards, so encoded text such
	# as &lt;b&gt; is counted as text rather than removed as a tag.
	$text = html_entity_decode(strip_tags($text));
	$text = strtolower(preg_replace("/[^a-z0-9]/i", " ", $text));

	# Splitting with NO_EMPTY yields zero words for an empty page instead of
	# a single empty-string "word".
	$list = preg_split("/\s+/", $text, -1, PREG_SPLIT_NO_EMPTY);
	$total = count($list);
	$words = array_count_values($list);
	$different = count($words);

	# Most frequent words first; keys (the words) are preserved.
	uasort($words, "cmp");

	print "<p>Found $total total words, using $different different words.</p>\n";

	print "<table class=\"data\" cellspacing=\"0\">\n";
	print "<tr><td class=\"short gray\">Count</td><td class=\"gray\" colspan=\"2\">Word</td></tr>\n";

	# Words consist solely of [a-z0-9] at this point, so no escaping needed.
	foreach($words as $key => $val)
	{
		print "<tr><td class=\"short\">$val</td><td>$key</td></tr>\n";
	}

	print "</table>\n";
}
else
{
	$title = "Word Harvester";
	include_once($header_path);
	printForm();
}

include_once($footer_path);

?>