1: <?php
2:
3: 4: 5:
/**
 * Splits words and texts into syllables (hyphenation points) using a table of
 * scored letter patterns plus a list of pre-hyphenated exception words.
 *
 * Pattern data is read from a Syllable_Source_Interface and, when a cache is
 * configured, stored in and restored from a Syllable_Cache_Interface.
 * The resulting parts are joined by a Syllable_Hyphen_Interface.
 */
class Syllable {
    /**
     * Version stamp written to the language cache. Cached data is only used
     * when its stored version equals this value; otherwise it is rebuilt
     * from the source.
     */
    const CACHE_VERSION = 1.4;

    /**
     * @deprecated Thresholds were removed; all three constants share one value.
     */
    const TRESHOLD_LEAST = PHP_INT_MAX;

    /**
     * @deprecated Thresholds were removed; all three constants share one value.
     */
    const TRESHOLD_AVERAGE = PHP_INT_MAX;

    /**
     * @deprecated Thresholds were removed; all three constants share one value.
     */
    const TRESHOLD_MOST = PHP_INT_MAX;

    /**
     * @var Syllable_Cache_Interface|null Cache for loaded language data; null disables caching.
     */
    private $Cache;

    /**
     * @var Syllable_Source_Interface Provider of patterns, exceptions and minimum hyphen distances.
     */
    private $Source;

    /**
     * @var Syllable_Hyphen_Interface Strategy used to join the split parts.
     */
    private $Hyphen;

    /**
     * @var string Language identifier passed to the source and used as cache key.
     */
    private $language;

    private $left_min_hyphen = 2;
    private $right_min_hyphen = 2;
    private $patterns = null;
    private $max_pattern = null;
    private $hyphenation = null;

    private static $cache_dir = null;
    private static $language_dir = null;

    /**
     * Create a new hyphenator.
     *
     * For backward compatibility a call of the form
     * (language, TRESHOLD_*, hyphen) is still accepted: when the second
     * argument equals the (deprecated) threshold constant, the hyphen is
     * taken from the third argument instead.
     *
     * @param string $language Language identifier.
     * @param string|Syllable_Hyphen_Interface|null $hyphen Hyphen strategy or
     *        hyphen text; a falsy value selects Syllable_Hyphen_Soft.
     */
    public function __construct($language = 'en', $hyphen = null) {
        if (!self::$cache_dir) {
            self::$cache_dir = __DIR__.'/cache';
        }
        $this->setCache(new Syllable_Cache_Json(self::$cache_dir));

        if (!self::$language_dir) {
            self::$language_dir = __DIR__.'/languages';
        }

        $this->setLanguage($language);

        if ($hyphen === self::TRESHOLD_MOST) {
            // Legacy three-argument call. Only read the third argument when it
            // was actually passed; func_get_arg() on a missing position warns
            // on older PHP versions and throws on PHP 8.
            $hyphen = func_num_args() > 2 ? func_get_arg(2) : null;
        }

        $this->setHyphen($hyphen ? $hyphen : new Syllable_Hyphen_Soft());
    }

    /**
     * Set the directory used for the default JSON cache of instances created
     * after this call.
     *
     * @param string $dir
     */
    public static function setCacheDir($dir) {
        self::$cache_dir = $dir;
    }

    /**
     * Set the directory handed to Syllable_Source_File when a language is set.
     *
     * @param string $dir
     */
    public static function setLanguageDir($dir) {
        self::$language_dir = $dir;
    }

    /**
     * Select the language and replace the source with a file source for it.
     *
     * @param string $language
     */
    public function setLanguage($language) {
        $this->language = $language;
        $this->setSource(new Syllable_Source_File($language, self::$language_dir));
    }

    /**
     * Set the hyphen strategy.
     *
     * @param mixed $hyphen A Syllable_Hyphen_Interface is used as-is; anything
     *        else is wrapped in a Syllable_Hyphen_Text.
     */
    public function setHyphen($hyphen) {
        $this->Hyphen = ($hyphen instanceof Syllable_Hyphen_Interface)
            ? $hyphen
            : new Syllable_Hyphen_Text($hyphen);
    }

    /**
     * @return Syllable_Hyphen_Interface The current hyphen strategy.
     */
    public function getHyphen() {
        return $this->Hyphen;
    }

    /**
     * @deprecated Has no effect other than raising E_USER_DEPRECATED.
     * @param int $treshold Ignored.
     */
    public function setTreshold($treshold = self::TRESHOLD_MOST) {
        trigger_error('Treshold removed', E_USER_DEPRECATED);
    }

    /**
     * @deprecated Raises E_USER_DEPRECATED.
     * @return int Always self::TRESHOLD_MOST.
     */
    public function getTreshold() {
        trigger_error('Treshold removed', E_USER_DEPRECATED);
        return self::TRESHOLD_MOST;
    }

    /**
     * Set the cache used for language data.
     *
     * The implicit-nullable signature is kept for compatibility with PHP
     * versions that lack the ?Type syntax.
     *
     * @param Syllable_Cache_Interface|null $Cache Pass null to disable caching.
     */
    public function setCache(Syllable_Cache_Interface $Cache = null) {
        $this->Cache = $Cache;
    }

    /**
     * @return Syllable_Cache_Interface|null The current cache, if any.
     */
    public function getCache() {
        return $this->Cache;
    }

    /**
     * @param Syllable_Source_Interface $Source Provider of the language data.
     */
    public function setSource(Syllable_Source_Interface $Source) {
        $this->Source = $Source;
    }

    /**
     * @return Syllable_Source_Interface The current language data source.
     */
    public function getSource() {
        return $this->Source;
    }

    /**
     * Split a single word into its syllables.
     *
     * @param string $word
     * @return array List of word parts.
     */
    public function splitWord($word) {
        self::useUtf8();
        $this->loadLanguage();

        return $this->parseWord($word);
    }

    /**
     * Split a text at every syllable boundary found inside its words.
     *
     * Characters between words (anything that is not a letter or an
     * apostrophe) are kept and stay attached to the neighbouring parts, so
     * concatenating the returned parts yields the original text whenever
     * parseWord() returns parts that concatenate to the original word.
     *
     * @param string $text
     * @return array List of text parts; always contains at least one element.
     */
    public function splitText($text) {
        self::useUtf8();
        $this->loadLanguage();

        $parts = array();
        $part = '';
        $pos = 0;

        foreach ($this->extractWords($text) as $word) {
            // Words come out of $text unchanged, so an exact
            // (case-sensitive) search from the current position finds them.
            $found = mb_strpos($text, $word, $pos);

            // Carry over the separator characters preceding this word.
            if ($found > $pos) {
                $part .= mb_substr($text, $pos, $found - $pos);
            }

            // The first syllable continues the running part; every further
            // syllable closes the running part and starts a new one.
            $syllables = $this->parseWord($word);
            $part .= array_shift($syllables);
            foreach ($syllables as $syllable) {
                $parts[] = $part;
                $part = $syllable;
            }

            $pos = $found + mb_strlen($word);
        }

        // Whatever follows the last word belongs to the final part.
        $parts[] = $part . mb_substr($text, $pos);

        return $parts;
    }

    /**
     * Split a word and join the parts with the configured hyphen.
     *
     * @param string $word
     * @return mixed Whatever the hyphen strategy's joinText() returns.
     */
    public function hyphenateWord($word) {
        $parts = $this->splitWord($word);
        return $this->Hyphen->joinText($parts);
    }

    /**
     * Split a text and join the parts with the configured hyphen.
     *
     * @param string $text
     * @return mixed Whatever the hyphen strategy's joinText() returns.
     */
    public function hyphenateText($text) {
        $parts = $this->splitText($text);
        return $this->Hyphen->joinText($parts);
    }

    /**
     * Hyphenate every text node of an HTML document.
     *
     * NOTE(review): resolveExternals is enabled and the markup is loaded
     * without an explicit charset hint; confirm both are intended before
     * feeding untrusted or non-ASCII input through this method.
     *
     * @param string $html
     * @return string The document as serialized by DOMDocument::saveHTML().
     */
    public function hyphenateHtml($html) {
        $dom = new DOMDocument();
        $dom->resolveExternals = true;
        $dom->loadHTML($html);

        $this->hyphenateHtmlDom($dom);

        return $dom->saveHTML();
    }

    /**
     * Recursively walk a DOM subtree (children first) and let the hyphen
     * strategy rewrite every DOMText node from its split parts.
     *
     * @param DOMNode $node Root of the subtree to process.
     */
    private function hyphenateHtmlDom(DOMNode $node) {
        if ($node->hasChildNodes()) {
            foreach ($node->childNodes as $child) {
                $this->hyphenateHtmlDom($child);
            }
        }
        if ($node instanceof DOMText) {
            $parts = $this->splitText($node->data);

            $this->Hyphen->joinHtmlDom($parts, $node);
        }
    }

    /**
     * Count how many words of the text have each number of syllables.
     *
     * @param string $text
     * @return array Map of syllable count => number of words with that count.
     */
    public function histogramText($text) {
        self::useUtf8();
        $this->loadLanguage();

        $counts = array();
        foreach ($this->extractWords($text) as $word) {
            $syllables = count($this->parseWord($word));
            if (isset($counts[$syllables])) {
                ++$counts[$syllables];
            } else {
                $counts[$syllables] = 1;
            }
        }

        return $counts;
    }

    /**
     * Count the words in a text.
     *
     * @param string $text
     * @return int Number of non-empty runs of letters/apostrophes.
     */
    public function countWordsText($text) {
        self::useUtf8();
        $this->loadLanguage();

        return count($this->extractWords($text));
    }

    /**
     * Count the words in a text that split into three or more syllables.
     *
     * @param string $text
     * @return int
     */
    public function countPolysyllablesText($text) {
        self::useUtf8();
        $this->loadLanguage();

        $count = 0;
        foreach ($this->extractWords($text) as $word) {
            if (count($this->parseWord($word)) >= 3) {
                ++$count;
            }
        }

        return $count;
    }

    /**
     * Switch the mbstring internal and regex encodings to UTF-8, which the
     * mb_* calls in this class rely on. This changes process-wide settings.
     */
    private static function useUtf8() {
        mb_internal_encoding('UTF-8');
        mb_regex_encoding('UTF-8');
    }

    /**
     * Extract the words of a text: the non-empty pieces left after splitting
     * on every run of characters that are neither letters nor apostrophes.
     *
     * @param string $text
     * @return array List of words in order of appearance.
     */
    private function extractWords($text) {
        $words = array();
        foreach (mb_split('[^\'[:alpha:]]+', $text) as $candidate) {
            if ($candidate !== '') {
                $words[] = $candidate;
            }
        }

        return $words;
    }

    /**
     * Load patterns, exceptions and minimum hyphen distances for the current
     * language.
     *
     * A configured cache is consulted first and used only if it holds every
     * expected field and a matching CACHE_VERSION. Otherwise the data is read
     * from the source and, if a cache is configured, written back to it.
     */
    private function loadLanguage() {
        $loaded = false;

        $cache = $this->getCache();
        if ($cache !== null) {
            $cache->open($this->language);

            if (isset($cache->version) && $cache->version == self::CACHE_VERSION
                    && isset($cache->patterns)
                    && isset($cache->max_pattern)
                    && isset($cache->hyphenation)
                    && isset($cache->left_min_hyphen)
                    && isset($cache->right_min_hyphen)) {
                $this->patterns = $cache->patterns;
                $this->max_pattern = $cache->max_pattern;
                $this->hyphenation = $cache->hyphenation;
                $this->left_min_hyphen = $cache->left_min_hyphen;
                $this->right_min_hyphen = $cache->right_min_hyphen;

                $loaded = true;
            }
        }

        if ($loaded) {
            return;
        }

        $source = $this->getSource();
        $this->patterns = $source->getPatterns();
        $this->max_pattern = $source->getMaxPattern();
        $this->hyphenation = $source->getHyphentations();

        // Default distances, overridden when the source specifies its own.
        $this->left_min_hyphen = 2;
        $this->right_min_hyphen = 2;
        $minHyphens = $source->getMinHyphens();
        if ($minHyphens) {
            $this->left_min_hyphen = $minHyphens[0];
            $this->right_min_hyphen = $minHyphens[1];
        }

        if ($cache !== null) {
            $cache->version = self::CACHE_VERSION;
            $cache->patterns = $this->patterns;
            $cache->max_pattern = $this->max_pattern;
            $cache->hyphenation = $this->hyphenation;
            $cache->left_min_hyphen = $this->left_min_hyphen;
            $cache->right_min_hyphen = $this->right_min_hyphen;

            $cache->close();
        }
    }

    /**
     * Split one word into parts using the loaded language data.
     *
     * Expects loadLanguage() to have run and the mbstring encoding to be
     * UTF-8.
     *
     * @param string $word
     * @return array List of word parts.
     */
    private function parseWord($word) {
        $word_length = mb_strlen($word);

        // Too short to leave the minimum number of characters on both sides.
        if ($word_length < $this->left_min_hyphen + $this->right_min_hyphen) {
            return array($word);
        }

        // Pre-hyphenated exception word (looked up by its exact spelling).
        if (isset($this->hyphenation[$word])) {
            return mb_split('-', $this->hyphenation[$word]);
        }

        // Patterns are matched against the lowercased word wrapped in dots.
        $text = '.'.mb_strtolower($word).'.';
        $text_length = $word_length + 2;
        $pattern_length = $this->max_pattern < $text_length ? $this->max_pattern : $text_length;

        // $before[$i] holds the highest score seen for the position directly
        // before character $i of $text.
        $before = array();
        $end = $text_length - $this->right_min_hyphen;
        for ($start = 0; $start < $end; ++$start) {
            // NOTE(review): the bound starts at $start + $pattern_length and is
            // then clamped to the remaining text; kept exactly as it was.
            $max_length = $start + $pattern_length;
            if ($text_length - $start < $max_length) {
                $max_length = $text_length - $start;
            }
            for ($length = 1; $length <= $max_length; ++$length) {
                $subword = mb_substr($text, $start, $length);
                if (isset($this->patterns[$subword])) {
                    // One score per gap: a pattern of $length characters
                    // carries $length + 1 scores.
                    $scores = $this->patterns[$subword];
                    $scores_length = $length + 1;
                    for ($offset = 0; $offset < $scores_length; ++$offset) {
                        // Square brackets: the curly-brace offset syntax is a
                        // parse error as of PHP 8.
                        $score = $scores[$offset];
                        if (!isset($before[$start + $offset]) || $score > $before[$start + $offset]) {
                            $before[$start + $offset] = $score;
                        }
                    }
                }
            }
        }

        // Build the parts: an odd score marks a split point. Index $i refers
        // to $text, so the matching character of $word sits at $i - 1.
        $parts = array();
        $part = mb_substr($word, 0, $this->left_min_hyphen);
        for ($i = $this->left_min_hyphen + 1; $i < $end; ++$i) {
            if (isset($before[$i])) {
                $score = (int)$before[$i];
                if ($score & 1) {
                    $parts[] = $part;
                    $part = '';
                }
            }
            $part .= mb_substr($word, $i - 1, 1);
        }
        // The trailing right_min_hyphen characters are never split.
        for (; $i < $text_length - 1; ++$i) {
            $part .= mb_substr($word, $i - 1, 1);
        }
        // Compare against '' explicitly: empty() would also discard "0".
        if ($part !== '') {
            $parts[] = $part;
        }

        return $parts;
    }
}