PHP HTMLからタイトルとbodyのテキストを抽出

April 26, 2007

FullTextの登録用に、HTMLからタイトルとbodyのテキストを抜き出す。XML_HTMLSax でパースして前述のテキストを抜き出すクラス HtmlIndexExtractor を作成した。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
<?php
require_once('XML/XML_HTMLSax.php');
/**
 * Contract for extractors that process a source document passed to extract().
 *
 * The interface only fixes the entry point; how results are exposed afterwards
 * is left to the implementing class.
 */
interface IndexExtractor
{
    /**
     * Run the extraction over the given input.
     *
     * @param mixed $target Input to extract from; the expected type is defined
     *                      by the implementing class.
     */
    public function extract($target);
}
/**
 * Extracts the text of the <title> element and of the <body> element from HTML.
 *
 * The markup is fed to an XML_HTMLSax parser, which calls back into the handler
 * methods of this object. While a <title> or <body> element is open, character
 * data is accumulated; when the element closes, the accumulated text is stored
 * and can be read with getTitle() / getContent().
 */
class HtmlIndexExtractor implements IndexExtractor
{
    /** @var XML_HTMLSax Parser that drives the handler callbacks. */
    private $parser;

    /** @var string|null Text captured from the last closed <title>, or null if none. */
    private $title;

    /** @var string|null Text captured from the last closed <body>, or null if none. */
    private $content;

    /** @var string Buffer for the element currently being captured. */
    private $str = '';

    /** @var bool True while character data should be appended to the buffer. */
    private $getstr = false;

    /**
     * Create the parser and register this object's element and data handlers.
     *
     * Uses __construct() rather than a method named after the class, because
     * class-named constructors are no longer recognised as constructors in PHP 8.
     */
    public function __construct()
    {
        // Plain assignment: "=& new" is a parse error from PHP 7 onwards.
        $this->parser = new XML_HTMLSax();
        $this->parser->set_object($this);
        $this->parser->set_option('XML_OPTION_TRIM_DATA_NODES');
        $this->parser->set_element_handler('openHandler', 'closeHandler');
        $this->parser->set_data_handler('dataHandler');
    }

    /**
     * Parse the given HTML and capture its title and body text.
     *
     * Results of any earlier call are discarded first, so a document without a
     * <title> or <body> does not report text from a previously parsed one.
     *
     * @param string $target HTML markup to parse.
     */
    public function extract($target)
    {
        $this->title = null;
        $this->content = null;
        $this->str = '';
        $this->getstr = false;

        // Encoding normalisation is currently disabled; the input is parsed as given.
        // The disabled variant was:
        //   $orgenc = mb_detect_encoding($target);
        //   if ($orgenc != "UTF-8") {
        //       $target = mb_convert_encoding($target, "UTF-8", $orgenc);
        //   }
        $this->parser->parse($target);
    }

    /**
     * @return string|null Title text from the last extract() call, or null if no
     *                     <title> element was closed.
     */
    public function getTitle()
    {
        return $this->title;
    }

    /**
     * @return string|null Body text from the last extract() call, or null if no
     *                     <body> element was closed.
     */
    public function getContent()
    {
        return $this->content;
    }

    /**
     * Element-open callback: start a fresh capture on <title> or <body>.
     *
     * @param object $parser Parser instance issuing the callback (unused).
     * @param string $name   Tag name as reported by the parser.
     * @param array  $attrs  Tag attributes (unused).
     */
    public function openHandler(&$parser, $name, $attrs)
    {
        $tagname = strtolower($name);
        if ($tagname === 'title' || $tagname === 'body') {
            $this->str = '';
            $this->getstr = true;
        }
    }

    /**
     * Element-close callback: store the buffer when <title> or <body> closes.
     *
     * @param object $parser Parser instance issuing the callback (unused).
     * @param string $name   Tag name as reported by the parser.
     */
    public function closeHandler(&$parser, $name)
    {
        $tagname = strtolower($name);
        if ($tagname === 'title') {
            $this->title = $this->str;
            $this->getstr = false;
        } elseif ($tagname === 'body') {
            $this->content = $this->str;
            $this->getstr = false;
        }
    }

    /**
     * Character-data callback: append entity-decoded text while capturing.
     *
     * @param object $parser Parser instance issuing the callback (unused).
     * @param string $data   Text node content.
     */
    public function dataHandler(&$parser, $data)
    {
        if ($this->getstr) {
            $this->str .= htmlspecialchars_decode($data);
        }
    }

    /**
     * No-op callback; not registered by the constructor of this class.
     */
    public function escapeHandler(&$parser, $data)
    {
    }

    /**
     * No-op callback; not registered by the constructor of this class.
     */
    public function piHandler(&$parser, $target, $data)
    {
    }

    /**
     * No-op callback; not registered by the constructor of this class.
     */
    public function jaspHandler(&$parser, $data)
    {
    }
}
?>
PHP

tilfin freelance software engineer