hunter/extractor.go
2020-05-18 18:24:01 +08:00

184 lines
3.8 KiB
Go

package hunter
import (
"github.com/lestrrat-go/libxml2"
"github.com/lestrrat-go/libxml2/clib"
"github.com/lestrrat-go/libxml2/types"
)
// Extractor wraps an HTML document so that it can be queried with XPath
// expressions.
type Extractor struct {
// Content is the raw input that was handed to NewExtractor.
Content []byte
// doc is the document produced by parsing Content; XPath queries start here.
doc types.Document
}
// NewExtractor parses content as HTML and returns an Extractor bound to the
// resulting document. The original bytes are kept in the Content field.
//
// It panics if libxml2 reports a parse error.
func NewExtractor(content []byte) *Extractor {
	etor := &Extractor{Content: content}

	var err error
	if etor.doc, err = libxml2.ParseHTML(content); err != nil {
		panic(err)
	}
	return etor
}
// XPath evaluates the XPath expression exp against the parsed document and
// wraps the outcome in an *XPath that can be queried further with the
// ForEach* methods.
//
// The returned *XPath is never nil and is configured with ERROR_SKIP. When exp
// cannot be evaluated, the error is returned together with an *XPath that
// holds no results, so a nil result is never stored for later ForEach* calls
// to dereference.
func (etor *Extractor) XPath(exp string) (*XPath, error) {
	xp := &XPath{errorFlags: ERROR_SKIP}

	result, err := etor.doc.Find(exp)
	if err != nil {
		return xp, err
	}
	// Only store a usable result; the ForEach* methods call NodeIter on every
	// stored entry.
	if result != nil {
		xp.result = []types.XPathResult{result}
	}
	return xp, nil
}
// ErrorFlags selects how the XPath ForEach* traversals react when evaluating
// an expression on a node fails.
type ErrorFlags int
const (
// ERROR_SKIP collects the error into the returned error list and keeps
// iterating.
ERROR_SKIP ErrorFlags = 1
// ERROR_BREAK stops iterating when an error is encountered.
ERROR_BREAK ErrorFlags = 2
)
// XPath holds the results of one or more XPath evaluations. The nodes in
// those results serve as the context nodes for the ForEach* methods.
type XPath struct {
// result is the list of evaluation results whose nodes are iterated.
result []types.XPathResult
// errorFlags controls how evaluation errors are handled during iteration.
errorFlags ErrorFlags
}
// ForEachString evaluates exp relative to every node held by xp and returns
// the String() form of each matched node, in traversal order. Errors are
// those reported by ForEachEx.
func (xp *XPath) ForEachString(exp string) (sstr []string, errorlist []error) {
	// collect gathers the String() form of every node in one evaluation result.
	collect := func(found types.XPathResult) []interface{} {
		var out []interface{}
		it := found.NodeIter()
		for it.Next() {
			out = append(out, it.Node().String())
		}
		return out
	}

	raw, errs := xp.ForEachEx(exp, collect)
	for idx := range raw {
		sstr = append(sstr, raw[idx].(string))
	}
	return sstr, errs
}
// ForEachText evaluates exp relative to every node held by xp and returns the
// TextContent() of each matched node, in traversal order. Errors are those
// reported by ForEachEx.
func (xp *XPath) ForEachText(exp string) (texts []string, errorlist []error) {
	// collect gathers the text content of every node in one evaluation result.
	collect := func(found types.XPathResult) []interface{} {
		var out []interface{}
		it := found.NodeIter()
		for it.Next() {
			node := it.Node()
			out = append(out, node.TextContent())
		}
		return out
	}

	raw, errs := xp.ForEachEx(exp, collect)
	for idx := range raw {
		texts = append(texts, raw[idx].(string))
	}
	return texts, errs
}
// ForEachType evaluates exp relative to every node held by xp and returns the
// NodeType() of each matched node, in traversal order. Errors are those
// reported by ForEachEx.
func (xp *XPath) ForEachType(exp string) (typelist []clib.XMLNodeType, errorlist []error) {
	// collect gathers the node type of every node in one evaluation result.
	collect := func(found types.XPathResult) []interface{} {
		var out []interface{}
		it := found.NodeIter()
		for it.Next() {
			kind := it.Node().NodeType()
			out = append(out, kind)
		}
		return out
	}

	raw, errs := xp.ForEachEx(exp, collect)
	for idx := range raw {
		typelist = append(typelist, raw[idx].(clib.XMLNodeType))
	}
	return typelist, errs
}
// ForEachValue evaluates exp relative to every node held by xp and returns
// the NodeValue() of each matched node, in traversal order. Errors are those
// reported by ForEachEx.
func (xp *XPath) ForEachValue(exp string) (values []string, errorlist []error) {
	// collect gathers the node value of every node in one evaluation result.
	collect := func(found types.XPathResult) []interface{} {
		var out []interface{}
		it := found.NodeIter()
		for it.Next() {
			out = append(out, it.Node().NodeValue())
		}
		return out
	}

	raw, errs := xp.ForEachEx(exp, collect)
	for idx := range raw {
		values = append(values, raw[idx].(string))
	}
	return values, errs
}
// ForEachName evaluates exp relative to every node held by xp and returns the
// NodeName() of each matched node, in traversal order. Errors are those
// reported by ForEachEx.
func (xp *XPath) ForEachName(exp string) (names []string, errorlist []error) {
	// collect gathers the node name of every node in one evaluation result.
	collect := func(found types.XPathResult) []interface{} {
		var out []interface{}
		it := found.NodeIter()
		for it.Next() {
			out = append(out, it.Node().NodeName())
		}
		return out
	}

	raw, errs := xp.ForEachEx(exp, collect)
	for idx := range raw {
		names = append(names, raw[idx].(string))
	}
	return names, errs
}
// ForEachEx evaluates exp relative to every node currently held by xp, hands
// each successful evaluation result to do, and concatenates whatever do
// returns into values, in traversal order.
//
// Error handling follows xp.errorFlags:
//   - ERROR_SKIP: the error is appended to errorlist, the failing node is
//     skipped and the traversal continues.
//   - any other value (e.g. ERROR_BREAK): the error is appended to errorlist
//     and the traversal stops; values gathered so far are returned.
//
// do is never invoked with the result of a failed evaluation, so it does not
// need to guard against a nil result.
//
// NOTE(review): the intermediate results passed to do are not Free()'d here
// because do may retain them — confirm whether callers are expected to
// release them.
func (xp *XPath) ForEachEx(exp string, do func(types.XPathResult) []interface{}) (values []interface{}, errorlist []error) {
	for _, xpresult := range xp.result {
		// A nil entry has no nodes to visit; calling NodeIter on it would panic.
		if xpresult == nil {
			continue
		}
		for iter := xpresult.NodeIter(); iter.Next(); {
			result, err := iter.Node().Find(exp)
			if err != nil {
				errorlist = append(errorlist, err)
				if xp.errorFlags == ERROR_SKIP {
					continue
				}
				return values, errorlist
			}
			values = append(values, do(result)...)
		}
	}
	return values, errorlist
}
// ForEach evaluates exp relative to every node currently held by xp and
// returns a new *XPath holding the successful evaluation results, so that
// queries can be chained. The new *XPath inherits xp's error flags.
//
// When xp holds no results at all, newxpath is nil and errorlist is empty.
//
// Error handling follows xp.errorFlags:
//   - ERROR_SKIP: the error is appended to errorlist, the failing node is
//     skipped and the traversal continues.
//   - any other value (e.g. ERROR_BREAK): the error is appended to errorlist
//     and the traversal stops; results gathered so far are returned.
//
// Results of failed evaluations are never stored in newxpath, so follow-up
// calls on it do not encounter nil entries.
func (xp *XPath) ForEach(exp string) (newxpath *XPath, errorlist []error) {
	if len(xp.result) == 0 {
		return nil, nil
	}

	newxpath = &XPath{errorFlags: xp.errorFlags}
	for _, xpresult := range xp.result {
		// A nil entry has no nodes to visit; calling NodeIter on it would panic.
		if xpresult == nil {
			continue
		}
		for iter := xpresult.NodeIter(); iter.Next(); {
			result, err := iter.Node().Find(exp)
			if err != nil {
				errorlist = append(errorlist, err)
				if xp.errorFlags == ERROR_SKIP {
					continue
				}
				return newxpath, errorlist
			}
			newxpath.result = append(newxpath.result, result)
		}
	}
	return newxpath, errorlist
}