todo: AreaCode test
This commit is contained in:
parent
a5a30391af
commit
610cf64359
183
extractor.go
Normal file
183
extractor.go
Normal file
|
@ -0,0 +1,183 @@
|
|||
package hunter
|
||||
|
||||
import (
|
||||
"github.com/lestrrat-go/libxml2"
|
||||
"github.com/lestrrat-go/libxml2/clib"
|
||||
"github.com/lestrrat-go/libxml2/types"
|
||||
)
|
||||
|
||||
// Extractor wraps a parsed HTML document together with the raw bytes it
// was built from, and is the entry point for XPath-based extraction.
type Extractor struct {
	Content []byte         // raw HTML source passed to NewExtractor
	doc     types.Document // parsed libxml2 document tree
}
|
||||
|
||||
// NewExtractor 创建提取器
|
||||
func NewExtractor(content []byte) *Extractor {
|
||||
doc, err := libxml2.ParseHTML(content)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return &Extractor{Content: content, doc: doc}
|
||||
}
|
||||
|
||||
// XPath 路径提取
|
||||
func (etor *Extractor) XPath(exp string) (*XPath, error) {
|
||||
result, err := etor.doc.Find(exp)
|
||||
return &XPath{result: []types.XPathResult{result}, errorFlags: ERROR_SKIP}, err
|
||||
}
|
||||
|
||||
// ErrorFlags selects how XPath iteration reacts to per-node Find errors.
type ErrorFlags int

const (
	// ERROR_SKIP records each error and keeps iterating.
	ERROR_SKIP ErrorFlags = 1
	// ERROR_BREAK stops iterating the current result set on the first error.
	ERROR_BREAK ErrorFlags = 2
)
|
||||
|
||||
// XPath holds a set of XPath evaluation results plus the error policy
// applied when sub-expressions are evaluated on each matched node.
type XPath struct {
	result     []types.XPathResult // accumulated result sets
	errorFlags ErrorFlags          // ERROR_SKIP or ERROR_BREAK
}
|
||||
|
||||
func (xp *XPath) ForEachString(exp string) (sstr []string, errorlist []error) {
|
||||
|
||||
inames, errlist := xp.ForEachEx(exp, func(result types.XPathResult) []interface{} {
|
||||
var ir []interface{}
|
||||
for iter := result.NodeIter(); iter.Next(); {
|
||||
|
||||
ir = append(ir, iter.Node().String())
|
||||
}
|
||||
return ir
|
||||
})
|
||||
|
||||
for _, i := range inames {
|
||||
sstr = append(sstr, i.(string))
|
||||
}
|
||||
|
||||
return sstr, errlist
|
||||
}
|
||||
|
||||
func (xp *XPath) ForEachText(exp string) (texts []string, errorlist []error) {
|
||||
|
||||
inames, errlist := xp.ForEachEx(exp, func(result types.XPathResult) []interface{} {
|
||||
var ir []interface{}
|
||||
for iter := result.NodeIter(); iter.Next(); {
|
||||
|
||||
ir = append(ir, iter.Node().TextContent())
|
||||
}
|
||||
return ir
|
||||
})
|
||||
|
||||
for _, i := range inames {
|
||||
texts = append(texts, i.(string))
|
||||
}
|
||||
|
||||
return texts, errlist
|
||||
}
|
||||
|
||||
func (xp *XPath) ForEachType(exp string) (typelist []clib.XMLNodeType, errorlist []error) {
|
||||
|
||||
inames, errlist := xp.ForEachEx(exp, func(result types.XPathResult) []interface{} {
|
||||
var ir []interface{}
|
||||
for iter := result.NodeIter(); iter.Next(); {
|
||||
|
||||
ir = append(ir, iter.Node().NodeType())
|
||||
}
|
||||
return ir
|
||||
})
|
||||
|
||||
for _, i := range inames {
|
||||
typelist = append(typelist, i.(clib.XMLNodeType))
|
||||
}
|
||||
|
||||
return typelist, errlist
|
||||
}
|
||||
|
||||
func (xp *XPath) ForEachValue(exp string) (values []string, errorlist []error) {
|
||||
|
||||
inames, errlist := xp.ForEachEx(exp, func(result types.XPathResult) []interface{} {
|
||||
var ir []interface{}
|
||||
for iter := result.NodeIter(); iter.Next(); {
|
||||
|
||||
ir = append(ir, iter.Node().NodeValue())
|
||||
}
|
||||
return ir
|
||||
})
|
||||
|
||||
for _, i := range inames {
|
||||
values = append(values, i.(string))
|
||||
}
|
||||
|
||||
return values, errlist
|
||||
}
|
||||
|
||||
func (xp *XPath) ForEachName(exp string) (names []string, errorlist []error) {
|
||||
|
||||
inames, errlist := xp.ForEachEx(exp, func(result types.XPathResult) []interface{} {
|
||||
var ir []interface{}
|
||||
for iter := result.NodeIter(); iter.Next(); {
|
||||
|
||||
ir = append(ir, iter.Node().NodeName())
|
||||
}
|
||||
return ir
|
||||
})
|
||||
|
||||
for _, i := range inames {
|
||||
names = append(names, i.(string))
|
||||
}
|
||||
|
||||
return names, errlist
|
||||
}
|
||||
|
||||
func (xp *XPath) ForEachEx(exp string, do func(types.XPathResult) []interface{}) (values []interface{}, errorlist []error) {
|
||||
if len(xp.result) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
for _, xpresult := range xp.result {
|
||||
|
||||
iter := xpresult.NodeIter()
|
||||
for iter.Next() {
|
||||
node := iter.Node()
|
||||
result, err := node.Find(exp)
|
||||
iresult := do(result)
|
||||
if err != nil {
|
||||
if xp.errorFlags == ERROR_SKIP {
|
||||
errorlist = append(errorlist, err)
|
||||
} else {
|
||||
break
|
||||
}
|
||||
}
|
||||
values = append(values, iresult...)
|
||||
}
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
func (xp *XPath) ForEach(exp string) (newxpath *XPath, errorlist []error) {
|
||||
if len(xp.result) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
newxpath = &XPath{errorFlags: xp.errorFlags}
|
||||
|
||||
for _, xpresult := range xp.result {
|
||||
|
||||
iter := xpresult.NodeIter()
|
||||
for iter.Next() {
|
||||
node := iter.Node()
|
||||
result, err := node.Find(exp)
|
||||
if err != nil {
|
||||
if xp.errorFlags == ERROR_SKIP {
|
||||
errorlist = append(errorlist, err)
|
||||
} else {
|
||||
break
|
||||
}
|
||||
}
|
||||
newxpath.result = append(newxpath.result, result)
|
||||
}
|
||||
}
|
||||
|
||||
return
|
||||
}
|
36
extractor_test.go
Normal file
36
extractor_test.go
Normal file
|
@ -0,0 +1,36 @@
|
|||
package hunter
|
||||
|
||||
import (
|
||||
"log"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// AreaCode is a test task: it embeds PreFile so its input comes from a
// local HTML fixture file instead of the network.
type AreaCode struct {
	PreFile
}
|
||||
|
||||
func (a *AreaCode) Execute(cxt *TaskContext) {
|
||||
r, err := cxt.Hunt()
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
etor := NewExtractor(r.Content())
|
||||
xp, err := etor.XPath("//div[@class='ip']")
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
log.Println(xp.ForEachText("./h4"))
|
||||
xpli, errlist := xp.ForEach("./h4/ul//li")
|
||||
if len(errlist) != 0 {
|
||||
panic(err)
|
||||
}
|
||||
log.Println(xpli.ForEachString("./h5/text()"))
|
||||
}
|
||||
|
||||
func TestExtractor(t *testing.T) {
|
||||
ht := NewHunter()
|
||||
ht.AddTask(&AreaCode{"./testfile/area.html"})
|
||||
ht.Execute()
|
||||
t.Error()
|
||||
}
|
|
@ -20,7 +20,7 @@ func (u *PreChromeUrl) Before(ctx *TaskContext) {
|
|||
var service *selenium.Service
|
||||
|
||||
if u.service == nil {
|
||||
for i := 0; i < 20; i++ {
|
||||
for i := 0; i < 50; i++ {
|
||||
if u.Port == 0 {
|
||||
u.Port = randomdata.Number(10000, 50000)
|
||||
}
|
||||
|
|
28
pre_file_com.go
Normal file
28
pre_file_com.go
Normal file
|
@ -0,0 +1,28 @@
|
|||
package hunter
|
||||
|
||||
import (
|
||||
"io/ioutil"
|
||||
"os"
|
||||
|
||||
"github.com/474420502/requests"
|
||||
)
|
||||
|
||||
// PreFile is a file pre-processing component for a Task: the string
// value is the path of a local file whose bytes serve as the response
// content.
type PreFile string
||||
|
||||
func (u PreFile) Hunt() (requests.IResponse, error) {
|
||||
|
||||
f, err := os.Open(string(u))
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
data, err := ioutil.ReadAll(f)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
resp := &HResponse{}
|
||||
resp.Hcontent = data
|
||||
return resp, err
|
||||
}
|
48
response.go
Normal file
48
response.go
Normal file
|
@ -0,0 +1,48 @@
|
|||
package hunter
|
||||
|
||||
import "net/http"
|
||||
|
||||
// IResponse interface {
|
||||
// Content() []byte
|
||||
// GetStatus() string
|
||||
// GetStatusCode() int
|
||||
// GetHeader() http.Header
|
||||
// GetCookie() []*http.Cookie
|
||||
|
||||
// // Returns a custom Response of any kind; it may also be some other defined struct such as a WebDriver.
|
||||
// GetResponse() interface{}
|
||||
// }
|
||||
|
||||
// HResponse is a ready-made IResponse implementation; the zero value is
// usable and individual fields can be filled in as needed.
type HResponse struct {
	Hcontent  []byte         // response body bytes
	Hstatus   string         // status line text
	Hcode     int            // numeric status code
	Hheader   http.Header    // response headers
	Hcookies  []*http.Cookie // cookies carried by the response
	Hresponse interface{}    // underlying custom response object (e.g. a WebDriver)
}
|
||||
|
||||
// Content returns the raw response body bytes.
func (resp *HResponse) Content() []byte {
	return resp.Hcontent
}
|
||||
|
||||
// GetStatus returns the status line text.
func (resp *HResponse) GetStatus() string {
	return resp.Hstatus
}
|
||||
|
||||
// GetStatusCode returns the numeric status code.
func (resp *HResponse) GetStatusCode() int {
	return resp.Hcode
}
|
||||
|
||||
// GetHeader returns the response headers.
func (resp *HResponse) GetHeader() http.Header {
	return resp.Hheader
}
|
||||
|
||||
// GetCookie returns the cookies carried by the response.
func (resp *HResponse) GetCookie() []*http.Cookie {
	return resp.Hcookies
}
|
||||
|
||||
// GetResponse returns the underlying custom response object.
func (resp *HResponse) GetResponse() interface{} {
	return resp.Hresponse
}
|
5442
testfile/area.html
Normal file
5442
testfile/area.html
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user