FullText の登録用に、HTML からタイトルと body のテキストを抜き出す。XML_HTMLSax でパースして前述のテキストを抜き出すクラス HtmlIndexExtractor を作成した。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61
| <?php
require_once('XML/XML_HTMLSax.php');

/**
 * Contract for classes that pull indexable text out of a document.
 */
interface IndexExtractor
{
    /**
     * Analyses the given document source.
     *
     * @param string $target Document source to analyse.
     */
    public function extract($target);
}

/**
 * Extracts the <title> text and the <body> text from an HTML document
 * by driving an XML_HTMLSax parser with this object as the handler.
 *
 * Usage: call extract() with the HTML source, then read the results via
 * getTitle() and getContent().
 */
class HtmlIndexExtractor implements IndexExtractor
{
    /** @var XML_HTMLSax SAX parser that calls back into this object. */
    private $parser;

    /** Not used by this class; kept from the original declaration. */
    private $handler;

    /** @var string|null Text collected inside <title>, or null if none was seen. */
    private $title;

    /** @var string|null Text collected inside <body>, or null if none was seen. */
    private $content;

    /** @var string Buffer for the element currently being captured. */
    private $str;

    /** @var bool True while character data should be appended to the buffer. */
    private $getstr;

    /**
     * Creates the parser and registers this object's element and data handlers.
     *
     * Uses __construct() rather than a method named after the class: the
     * latter is no longer treated as a constructor in PHP 8. Objects are
     * assigned without "=& new", which is a parse error in PHP 7 and later.
     */
    public function __construct()
    {
        $this->parser = new XML_HTMLSax();
        $this->parser->set_object($this);
        $this->parser->set_option('XML_OPTION_TRIM_DATA_NODES');
        $this->parser->set_element_handler('openHandler', 'closeHandler');
        $this->parser->set_data_handler('dataHandler');
    }

    /**
     * Parses the HTML source and stores its title and body text.
     *
     * State left over from a previous call is cleared first so that a
     * document lacking <title> or <body> does not report stale values.
     *
     * @param string $target HTML source to parse.
     */
    public function extract($target)
    {
        $this->title = null;
        $this->content = null;
        $this->str = '';
        $this->getstr = false;

        $this->parser->parse($target);
    }

    /**
     * @return string|null Title text from the last extract() call.
     */
    public function getTitle()
    {
        return $this->title;
    }

    /**
     * @return string|null Body text from the last extract() call.
     */
    public function getContent()
    {
        return $this->content;
    }

    /**
     * Opening-tag callback: starts a fresh capture on <title> or <body>.
     *
     * @param object $parser Parser instance (by reference, as in the original signature).
     * @param string $name   Tag name; compared case-insensitively.
     * @param array  $attrs  Tag attributes (unused).
     */
    public function openHandler(& $parser, $name, $attrs)
    {
        $tagname = strtolower($name);
        if ($tagname === 'title' || $tagname === 'body') {
            $this->str = '';
            $this->getstr = true;
        }
    }

    /**
     * Closing-tag callback: stores the buffer as the title or the body
     * text and stops capturing.
     *
     * @param object $parser Parser instance (by reference, as in the original signature).
     * @param string $name   Tag name; compared case-insensitively.
     */
    public function closeHandler(& $parser, $name)
    {
        $tagname = strtolower($name);
        if ($tagname === 'title') {
            $this->title = $this->str;
            $this->getstr = false;
        } elseif ($tagname === 'body') {
            $this->content = $this->str;
            $this->getstr = false;
        }
    }

    /**
     * Character-data callback: while capturing, appends the data to the
     * buffer after decoding HTML special-character entities.
     *
     * @param object $parser Parser instance (by reference, as in the original signature).
     * @param string $data   Character data reported by the parser.
     */
    public function dataHandler(& $parser, $data)
    {
        if ($this->getstr) {
            $this->str .= htmlspecialchars_decode($data);
        }
    }

    /** No-op; the constructor does not register this handler. */
    public function escapeHandler(& $parser, $data)
    {
    }

    /** No-op; the constructor does not register this handler. */
    public function piHandler(& $parser, $target, $data)
    {
    }

    /** No-op; the constructor does not register this handler. */
    public function jaspHandler(& $parser, $data)
    {
    }
}
?>
|
tilfin
freelance software engineer