Overview

Classes

  • Syllable
  • Syllable_Cache_FileAbstract
  • Syllable_Cache_Json
  • Syllable_Cache_Serialized
  • Syllable_Hyphen_Dash
  • Syllable_Hyphen_Entity
  • Syllable_Hyphen_Soft
  • Syllable_Hyphen_Text
  • Syllable_Hyphen_ZeroWidthSpace
  • Syllable_Source_File

Interfaces

  • Syllable_Cache_Interface
  • Syllable_Hyphen_Interface
  • Syllable_Source_Interface

Functions

  • Syllable_autoloader
  • Overview
  • Class
  1: <?php
  2: 
  3:     /**
  4:      * Main class
  5:      */
  6:     class Syllable {
  7:         /**
  8:          * Version string, used to recalculate language caches if needed.
  9:          */
 10:         const CACHE_VERSION         = 1.4;
 11: 
 12:         /**
 13:          * @deprecated since version 1.2
 14:          */
 15:         const TRESHOLD_LEAST        = PHP_INT_MAX;
 16:         /**
 17:          * @deprecated since version 1.2
 18:          */
 19:         const TRESHOLD_AVERAGE      = PHP_INT_MAX;
 20:         /**
 21:          * @deprecated since version 1.2
 22:          */
 23:         const TRESHOLD_MOST         = PHP_INT_MAX;
 24: 
 25:         /**
 26:          * @var Syllable_Cache_Interface
 27:          */
 28:         private $Cache;
 29: 
 30:         /**
 31:          * @var Syllable_Cache_Interface
 32:          */
 33:         private $Source;
 34: 
 35:         /**
 36:          * @var Syllable_Hyphen_Interface
 37:          */
 38:         private $Hyphen;
 39: 
 40:         private $language;
 41:         
 42:         private $left_min_hyphen    = 2;
 43:         private $right_min_hyphen   = 2;
 44:         private $patterns           = null;
 45:         private $max_pattern        = null;
 46:         private $hyphenation        = null;
 47:         
 48:         private static $cache_dir       = null;
 49:         private static $language_dir    = null;
 50: 
 51:         public function __construct($language = 'en', $hyphen = null) {
 52:             if (!self::$cache_dir) {
 53:                 self::$cache_dir = __DIR__.'/cache';
 54:             }
 55:             $this->setCache(new Syllable_Cache_Json(self::$cache_dir));             
 56:             
 57:             if (!self::$language_dir) {
 58:                 self::$language_dir = __DIR__.'/languages';
 59:             }
 60:                         
 61:             $this->setLanguage($language);
 62:             
 63:             if ($hyphen === self::TRESHOLD_MOST) {          
 64:                 $hyphen = func_get_arg(2);
 65:             }
 66:             
 67:             $this->setHyphen($hyphen? $hyphen : new Syllable_Hyphen_Soft());
 68:         }
 69: 
 70:         public static function setCacheDir($dir) {
 71:             self::$cache_dir = $dir;
 72:         }
 73:         
 74:         public static function setLanguageDir($dir) {
 75:             self::$language_dir = $dir;
 76:         }
 77:         
 78:         public function setLanguage($language) {
 79:             $this->language = $language;        
 80:             $this->setSource(new Syllable_Source_File($language, self::$language_dir));
 81:         }
 82: 
 83:         /**
 84:          * Set the hyphen to use when hyphenating text
 85:          * @param Mixed $hyphen either a Syllable_Hyphen_Interface or a string, which is turned into a Syllable_Hyphen_Text
 86:          */
 87:         public function setHyphen($hyphen) {
 88:             $this->Hyphen   = ($hyphen instanceof Syllable_Hyphen_Interface)
 89:                             ? $hyphen
 90:                             : new Syllable_Hyphen_Text($hyphen);
 91:         }
 92: 
 93:         /**
 94:          *
 95:          * @return Syllable_Hyphen_Interface hyphen
 96:          */
 97:         public function getHyphen() {
 98:             return $this->Hyphen;
 99:         }
100: 
101:         /**
102:          * Set the treshold.
103:          * This feature is deprecated as it was based on misinterpretation of
104:          * the algorithm.
105:          * @param type $treshold
106:          * @deprecated since version 1.2
107:          */
108:         public function setTreshold($treshold = self::TRESHOLD_MOST) {
109:             trigger_error('Treshold removed', E_USER_DEPRECATED);
110:         }
111: 
112:         /**
113:          * Get the treshold.
114:          * This feature is deprecated as it was based on misinterpretation of
115:          * the algorithm.
116:          * @return int
117:          * @deprecated since version 1.2
118:          */
119:         public function getTreshold() {
120:             trigger_error('Treshold removed', E_USER_DEPRECATED);
121:             return self::TRESHOLD_MOST;
122:         }
123: 
124:         /**
125:          *
126:          * @param Syllable_Cache_Interface $Cache
127:          */
128:         public function setCache(Syllable_Cache_Interface $Cache = null) {
129:             $this->Cache = $Cache;
130:         }
131: 
132:         /**
133:          * @return Syllable_Cache_Interface
134:          */
135:         public function getCache() {
136:             return $this->Cache;
137:         }
138: 
139:         public function setSource(Syllable_Source_Interface $Source) {
140:             $this->Source = $Source;
141:         }
142: 
143:         /**
144:          * @return Syllable_Source_Interface
145:          */
146:         public function getSource() {
147:             return $this->Source;
148:         }
149: 
150:         public function splitWord($word) {
151:             mb_internal_encoding('UTF-8');  //@todo upwards?
152:             mb_regex_encoding('UTF-8'); //@todo upwards?
153: 
154:             $this->loadLanguage();
155:             
156:             return $this->parseWord($word);
157:         }
158: 
159:         public function splitText($text) {
160:             mb_internal_encoding('UTF-8');  //@todo upwards?
161:             mb_regex_encoding('UTF-8'); //@todo upwards?
162: 
163:             $this->loadLanguage();
164: 
165:             $splits = mb_split('[^\'[:alpha:]]+', $text);
166:             $parts = array();
167:             $part = '';
168:             $pos = 0;
169: 
170:             foreach ($splits as $split) {
171:                 if (mb_strlen($split)) {
172:                     $p = mb_stripos($text, $split, $pos);
173: 
174:                     $length = $p - $pos;
175:                     if ($length >= 1) {
176:                         $part .= mb_substr($text, $pos, $length);
177:                     }
178:                     if (!empty($split)) {
179:                         $sw = $this->parseWord($split);
180:                         $index = 0;
181:                         $part .= $sw[$index++];
182:                         $sw_count = count($sw);
183:                         if ($sw_count > 1) {
184:                             do {
185:                                 $parts[] = $part;
186:                                 $part = $sw[$index++];
187:                             } while ($index < $sw_count);
188:                         }
189:                     }
190:                     $pos = $p + mb_strlen($split);
191:                 }
192:             }
193:             $parts[] = $part . mb_substr($text, $pos);
194: 
195:             return $parts;
196:         }
197: 
198:         public function hyphenateWord($word) {
199:             $parts = $this->splitWord($word);
200:             return $this->Hyphen->joinText($parts);
201:         }
202: 
203:         public function hyphenateText($text) {
204:             $parts = $this->splitText($text);
205:             return $this->Hyphen->joinText($parts);
206:         }
207: 
208:         public function hyphenateHtml($html) {
209:             $dom = new DOMDocument();
210:             $dom->resolveExternals = true;
211:             $dom->loadHTML($html);
212: 
213:             $this->hyphenateHtmlDom($dom);
214: 
215:             return $dom->saveHTML();
216:         }
217: 
218:         private function hyphenateHtmlDom(DOMNode $node) {
219:             if ($node->hasChildNodes()) {
220:                 foreach ($node->childNodes as $child) {
221:                     $this->hyphenateHtmlDom($child);
222:                 }
223:             }
224:             if ($node instanceof DOMText) {
225:                 $parts = $this->splitText($node->data);
226: 
227:                 $this->Hyphen->joinHtmlDom($parts, $node);
228:             }
229:         }
230:         
231:         public function histogramText($text) {
232:             mb_internal_encoding('UTF-8');  //@todo upwards?
233:             mb_regex_encoding('UTF-8'); //@todo upwards?
234:             
235:             $this->loadLanguage();          
236:             
237:             $counts = array();
238:             foreach (mb_split('[^\'[:alpha:]]+', $text) as $split) {
239:                 if (mb_strlen($split)) {
240:                     $count = count($this->parseWord($split));
241:                     if (isset($counts[$count])) {
242:                         ++$counts[$count];
243:                     } else {
244:                         $counts[$count] = 1;
245:                     }
246:                 }
247:             }
248:             
249:             return $counts;
250:         }
251:         
252:         public function countWordsText($text) {
253:             mb_internal_encoding('UTF-8');  //@todo upwards?
254:             mb_regex_encoding('UTF-8'); //@todo upwards?
255:             
256:             $this->loadLanguage();          
257:             
258:             $count = 0;
259:             foreach (mb_split('[^\'[:alpha:]]+', $text) as $split) {
260:                 if (mb_strlen($split)) {
261:                     ++$count;
262:                 }
263:             }
264:             
265:             return $count;
266:         }
267:         
268:         public function countPolysyllablesText($text) {
269:             mb_internal_encoding('UTF-8');  //@todo upwards?
270:             mb_regex_encoding('UTF-8'); //@todo upwards?
271:             
272:             $this->loadLanguage();          
273:             
274:             $count = 0;
275:             foreach (mb_split('[^\'[:alpha:]]+', $text) as $split) {
276:                 if (mb_strlen($split) && count($this->parseWord($split)) >= 3) {
277:                     ++$count;
278:                 }
279:             }
280:             
281:             return $count;
282:         }
283: 
284:         private function loadLanguage() {
285:             $loaded = false;
286:             
287:             $cache = $this->getCache();
288:             if ($cache !== null) {
289:                 $cache->open($this->language);
290: 
291:                 if (isset($cache->version) && $cache->version == self::CACHE_VERSION
292:                  && isset($cache->patterns)
293:                  && isset($cache->max_pattern)
294:                  && isset($cache->hyphenation)
295:                  && isset($cache->left_min_hyphen)
296:                  && isset($cache->right_min_hyphen)) {
297:                     $this->patterns         = $cache->patterns;
298:                     $this->max_pattern      = $cache->max_pattern;
299:                     $this->hyphenation      = $cache->hyphenation;
300:                     $this->left_min_hyphen  = $cache->left_min_hyphen;
301:                     $this->right_min_hyphen = $cache->right_min_hyphen;
302:                     
303:                     $loaded = true;
304:                  }
305:             }
306:             
307:             if (!$loaded) {
308:                 $source = $this->getSource();
309:                 $this->patterns         = $source->getPatterns();
310:                 $this->max_pattern      = $source->getMaxPattern();
311:                 $this->hyphenation      = $source->getHyphentations();
312: 
313:                 $this->left_min_hyphen  = 2;
314:                 $this->right_min_hyphen = 2;
315:                 $minHyphens = $source->getMinHyphens();
316:                 if ($minHyphens) {
317:                     $this->left_min_hyphen  = $minHyphens[0];
318:                     $this->right_min_hyphen = $minHyphens[1];
319:                 }
320:                 
321:                 if ($cache !== null) {
322:                     $cache->version             = self::CACHE_VERSION;
323:                     $cache->patterns            = $this->patterns;
324:                     $cache->max_pattern         = $this->max_pattern;
325:                     $cache->hyphenation         = $this->hyphenation;
326:                     $cache->left_min_hyphen     = $this->left_min_hyphen;
327:                     $cache->right_min_hyphen    = $this->right_min_hyphen;
328: 
329:                     $cache->close();
330:                 }
331: 
332:                 $loaded = true;
333:             }
334:         }
335: 
336:         /**
337:          * Splits a word into an array of syllables.
338:          * @param string $word the word to be split.
339:          * @return array array of syllables.
340:          */
341:         private function parseWord($word) {
342:             $word_length = mb_strlen($word);
343: 
344:             // Is this word smaller than the miminal length requirement?
345:             if ($word_length < $this->left_min_hyphen + $this->right_min_hyphen) {
346:                 return array($word);
347:             }
348: 
349:             // Is it a pre-hyphenated word?
350:             if (isset($this->hyphenation[$word])) {
351:                 return mb_split('-', $this->hyphenation[$word]);
352:             }
353: 
354:             // Convenience array
355:             $text           = '.'.mb_strtolower($word).'.';
356:             $text_length    = $word_length + 2;
357:             $pattern_length = $this->max_pattern < $text_length ? $this->max_pattern : $text_length;
358: 
359:             // Maximize
360:             $before     = array();
361:             $end        = $text_length - $this->right_min_hyphen;
362:             for ($start = 0; $start < $end; ++$start) {
363:                 $max_length = $start + $pattern_length;
364:                 if ($text_length - $start < $max_length) {
365:                     $max_length = $text_length - $start;
366:                 }
367:                 for ($length = 1; $length <= $max_length; ++$length) {
368:                     $subword = mb_substr($text, $start, $length);               
369:                     if (isset($this->patterns[$subword])) {
370:                         $scores = $this->patterns[$subword];
371:                         $scores_length = $length + 1;
372:                         for ($offset = 0; $offset < $scores_length; ++$offset) {
373:                             $score = $scores{$offset};
374:                             if (!isset($before[($start + $offset)]) || $score > $before[$start + $offset]) {
375:                                 $before[$start + $offset] = $score;
376:                             }
377:                         }
378:                     }
379:                 }
380:             }
381: 
382:             // Output
383:             $parts  = array();
384:             $part   = mb_substr($word, 0, $this->left_min_hyphen);
385:             for ($i = $this->left_min_hyphen + 1; $i < $end; ++$i) {
386:                 if (isset($before[$i])) {
387:                     $score  = (int)$before[$i];
388:                     if ($score & 1) {   // only odd
389:                         //$part .= $score; // debugging
390:                         $parts[] = $part;   
391:                         $part = '';
392:                     }
393:                 }
394:                 $part .= mb_substr($word, $i - 1, 1);
395:             }
396:             for (; $i < $text_length - 1; ++$i) {
397:                 $part .= mb_substr($word, $i - 1, 1);
398:             }
399:             if (!empty($part)) {
400:                 $parts[] = $part;
401:             }
402: 
403:             return $parts;
404:         }
405:     }
API documentation generated by ApiGen