184 lines
3.8 KiB
Go
184 lines
3.8 KiB
Go
package hunter
|
|
|
|
import (
|
|
"github.com/lestrrat-go/libxml2"
|
|
"github.com/lestrrat-go/libxml2/clib"
|
|
"github.com/lestrrat-go/libxml2/types"
|
|
)
|
|
|
|
// Extractor 提取器
|
|
type Extractor struct {
|
|
Content []byte
|
|
doc types.Document
|
|
}
|
|
|
|
// NewExtractor 创建提取器
|
|
func NewExtractor(content []byte) *Extractor {
|
|
doc, err := libxml2.ParseHTML(content)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
return &Extractor{Content: content, doc: doc}
|
|
}
|
|
|
|
// XPath 路径提取
|
|
func (etor *Extractor) XPath(exp string) (*XPath, error) {
|
|
result, err := etor.doc.Find(exp)
|
|
return &XPath{result: []types.XPathResult{result}, errorFlags: ERROR_SKIP}, err
|
|
}
|
|
|
|
// ErrorFlags selects how an XPath reacts when evaluating an expression
// against one of its nodes fails.
type ErrorFlags int

const (
	// ERROR_SKIP (1) records the failure in the returned error list and keeps
	// going.
	ERROR_SKIP ErrorFlags = iota + 1

	// ERROR_BREAK (2) abandons the node loop on a failure instead of moving on
	// to the next node.
	ERROR_BREAK
)
|
|
|
|
type XPath struct {
|
|
result []types.XPathResult
|
|
errorFlags ErrorFlags
|
|
}
|
|
|
|
func (xp *XPath) ForEachString(exp string) (sstr []string, errorlist []error) {
|
|
|
|
inames, errlist := xp.ForEachEx(exp, func(result types.XPathResult) []interface{} {
|
|
var ir []interface{}
|
|
for iter := result.NodeIter(); iter.Next(); {
|
|
|
|
ir = append(ir, iter.Node().String())
|
|
}
|
|
return ir
|
|
})
|
|
|
|
for _, i := range inames {
|
|
sstr = append(sstr, i.(string))
|
|
}
|
|
|
|
return sstr, errlist
|
|
}
|
|
|
|
func (xp *XPath) ForEachText(exp string) (texts []string, errorlist []error) {
|
|
|
|
inames, errlist := xp.ForEachEx(exp, func(result types.XPathResult) []interface{} {
|
|
var ir []interface{}
|
|
for iter := result.NodeIter(); iter.Next(); {
|
|
|
|
ir = append(ir, iter.Node().TextContent())
|
|
}
|
|
return ir
|
|
})
|
|
|
|
for _, i := range inames {
|
|
texts = append(texts, i.(string))
|
|
}
|
|
|
|
return texts, errlist
|
|
}
|
|
|
|
func (xp *XPath) ForEachType(exp string) (typelist []clib.XMLNodeType, errorlist []error) {
|
|
|
|
inames, errlist := xp.ForEachEx(exp, func(result types.XPathResult) []interface{} {
|
|
var ir []interface{}
|
|
for iter := result.NodeIter(); iter.Next(); {
|
|
|
|
ir = append(ir, iter.Node().NodeType())
|
|
}
|
|
return ir
|
|
})
|
|
|
|
for _, i := range inames {
|
|
typelist = append(typelist, i.(clib.XMLNodeType))
|
|
}
|
|
|
|
return typelist, errlist
|
|
}
|
|
|
|
func (xp *XPath) ForEachValue(exp string) (values []string, errorlist []error) {
|
|
|
|
inames, errlist := xp.ForEachEx(exp, func(result types.XPathResult) []interface{} {
|
|
var ir []interface{}
|
|
for iter := result.NodeIter(); iter.Next(); {
|
|
|
|
ir = append(ir, iter.Node().NodeValue())
|
|
}
|
|
return ir
|
|
})
|
|
|
|
for _, i := range inames {
|
|
values = append(values, i.(string))
|
|
}
|
|
|
|
return values, errlist
|
|
}
|
|
|
|
func (xp *XPath) ForEachName(exp string) (names []string, errorlist []error) {
|
|
|
|
inames, errlist := xp.ForEachEx(exp, func(result types.XPathResult) []interface{} {
|
|
var ir []interface{}
|
|
for iter := result.NodeIter(); iter.Next(); {
|
|
|
|
ir = append(ir, iter.Node().NodeName())
|
|
}
|
|
return ir
|
|
})
|
|
|
|
for _, i := range inames {
|
|
names = append(names, i.(string))
|
|
}
|
|
|
|
return names, errlist
|
|
}
|
|
|
|
func (xp *XPath) ForEachEx(exp string, do func(types.XPathResult) []interface{}) (values []interface{}, errorlist []error) {
|
|
if len(xp.result) == 0 {
|
|
return
|
|
}
|
|
|
|
for _, xpresult := range xp.result {
|
|
|
|
iter := xpresult.NodeIter()
|
|
for iter.Next() {
|
|
node := iter.Node()
|
|
result, err := node.Find(exp)
|
|
iresult := do(result)
|
|
if err != nil {
|
|
if xp.errorFlags == ERROR_SKIP {
|
|
errorlist = append(errorlist, err)
|
|
} else {
|
|
break
|
|
}
|
|
}
|
|
values = append(values, iresult...)
|
|
}
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
func (xp *XPath) ForEach(exp string) (newxpath *XPath, errorlist []error) {
|
|
if len(xp.result) == 0 {
|
|
return
|
|
}
|
|
|
|
newxpath = &XPath{errorFlags: xp.errorFlags}
|
|
|
|
for _, xpresult := range xp.result {
|
|
|
|
iter := xpresult.NodeIter()
|
|
for iter.Next() {
|
|
node := iter.Node()
|
|
result, err := node.Find(exp)
|
|
if err != nil {
|
|
if xp.errorFlags == ERROR_SKIP {
|
|
errorlist = append(errorlist, err)
|
|
} else {
|
|
break
|
|
}
|
|
}
|
|
newxpath.result = append(newxpath.result, result)
|
|
}
|
|
}
|
|
|
|
return
|
|
}
|