todo: AreaCode test

This commit is contained in:
huangsimin 2020-05-18 18:24:01 +08:00
parent a5a30391af
commit 610cf64359
6 changed files with 5738 additions and 1 deletions

183
extractor.go Normal file
View File

@ -0,0 +1,183 @@
package hunter
import (
"github.com/lestrrat-go/libxml2"
"github.com/lestrrat-go/libxml2/clib"
"github.com/lestrrat-go/libxml2/types"
)
// Extractor wraps a parsed HTML document so that XPath expressions can be
// evaluated against it.
type Extractor struct {
// Content is the raw input that was handed to NewExtractor.
Content []byte
// doc is the document produced by libxml2.ParseHTML from Content.
doc types.Document
}
// NewExtractor parses content as HTML and returns an Extractor that keeps both
// the raw bytes and the parsed document.
//
// It panics with the parser's error when libxml2.ParseHTML fails.
func NewExtractor(content []byte) *Extractor {
	etor := &Extractor{Content: content}

	var err error
	if etor.doc, err = libxml2.ParseHTML(content); err != nil {
		panic(err)
	}
	return etor
}
// XPath evaluates exp against the parsed document and wraps the result in an
// XPath selection that uses the ERROR_SKIP policy.
//
// When the evaluation fails it returns a nil selection together with the
// error, so a failed (nil) result can never be iterated by mistake.
func (etor *Extractor) XPath(exp string) (*XPath, error) {
	result, err := etor.doc.Find(exp)
	if err != nil {
		return nil, err
	}
	return &XPath{result: []types.XPathResult{result}, errorFlags: ERROR_SKIP}, nil
}
// ErrorFlags tells an XPath selection how to react when evaluating an
// expression against one of its nodes fails.
type ErrorFlags int

const (
	// ERROR_SKIP keeps iterating after a failed evaluation; the error is
	// collected in the returned error list.
	ERROR_SKIP ErrorFlags = iota + 1
	// ERROR_BREAK, like any value other than ERROR_SKIP, abandons the node set
	// being walked at the first failed evaluation.
	ERROR_BREAK
)
// XPath is a set of XPath evaluation results together with the error policy
// applied when further expressions are evaluated against their nodes.
type XPath struct {
// result holds the evaluation results whose nodes the ForEach* methods walk.
result []types.XPathResult
// errorFlags selects the reaction to a failed evaluation (see ErrorFlags).
errorFlags ErrorFlags
}
// ForEachString evaluates exp relative to every selected node (through
// ForEachEx) and returns the String() of each node found, in iteration order.
// The error list is the one reported by ForEachEx.
func (xp *XPath) ForEachString(exp string) (sstr []string, errorlist []error) {
	// collect turns one evaluation result into the String() of its nodes.
	collect := func(found types.XPathResult) []interface{} {
		var items []interface{}
		nodes := found.NodeIter()
		for nodes.Next() {
			items = append(items, nodes.Node().String())
		}
		return items
	}

	var raw []interface{}
	raw, errorlist = xp.ForEachEx(exp, collect)
	for _, item := range raw {
		sstr = append(sstr, item.(string))
	}
	return sstr, errorlist
}
// ForEachText evaluates exp relative to every selected node (through
// ForEachEx) and returns the TextContent() of each node found, in iteration
// order. The error list is the one reported by ForEachEx.
func (xp *XPath) ForEachText(exp string) (texts []string, errorlist []error) {
	// collect turns one evaluation result into the text content of its nodes.
	collect := func(found types.XPathResult) []interface{} {
		var items []interface{}
		nodes := found.NodeIter()
		for nodes.Next() {
			items = append(items, nodes.Node().TextContent())
		}
		return items
	}

	var raw []interface{}
	raw, errorlist = xp.ForEachEx(exp, collect)
	for _, item := range raw {
		texts = append(texts, item.(string))
	}
	return texts, errorlist
}
// ForEachType evaluates exp relative to every selected node (through
// ForEachEx) and returns the NodeType() of each node found, in iteration
// order. The error list is the one reported by ForEachEx.
func (xp *XPath) ForEachType(exp string) (typelist []clib.XMLNodeType, errorlist []error) {
	// collect turns one evaluation result into the node types of its nodes.
	collect := func(found types.XPathResult) []interface{} {
		var items []interface{}
		nodes := found.NodeIter()
		for nodes.Next() {
			items = append(items, nodes.Node().NodeType())
		}
		return items
	}

	var raw []interface{}
	raw, errorlist = xp.ForEachEx(exp, collect)
	for _, item := range raw {
		typelist = append(typelist, item.(clib.XMLNodeType))
	}
	return typelist, errorlist
}
// ForEachValue evaluates exp relative to every selected node (through
// ForEachEx) and returns the NodeValue() of each node found, in iteration
// order. The error list is the one reported by ForEachEx.
func (xp *XPath) ForEachValue(exp string) (values []string, errorlist []error) {
	// collect turns one evaluation result into the node values of its nodes.
	collect := func(found types.XPathResult) []interface{} {
		var items []interface{}
		nodes := found.NodeIter()
		for nodes.Next() {
			items = append(items, nodes.Node().NodeValue())
		}
		return items
	}

	var raw []interface{}
	raw, errorlist = xp.ForEachEx(exp, collect)
	for _, item := range raw {
		values = append(values, item.(string))
	}
	return values, errorlist
}
// ForEachName evaluates exp relative to every selected node (through
// ForEachEx) and returns the NodeName() of each node found, in iteration
// order. The error list is the one reported by ForEachEx.
func (xp *XPath) ForEachName(exp string) (names []string, errorlist []error) {
	// collect turns one evaluation result into the node names of its nodes.
	collect := func(found types.XPathResult) []interface{} {
		var items []interface{}
		nodes := found.NodeIter()
		for nodes.Next() {
			items = append(items, nodes.Node().NodeName())
		}
		return items
	}

	var raw []interface{}
	raw, errorlist = xp.ForEachEx(exp, collect)
	for _, item := range raw {
		names = append(names, item.(string))
	}
	return names, errorlist
}
// ForEachEx evaluates exp relative to every node currently selected by xp and
// flattens whatever do extracts from each successful evaluation into values.
//
// do is only invoked with the result of a successful evaluation. Every
// evaluation error is appended to errorlist. With ERROR_SKIP the failing node
// is skipped and iteration continues; with any other flag iteration stops at
// the first error and the values gathered so far are returned.
func (xp *XPath) ForEachEx(exp string, do func(types.XPathResult) []interface{}) (values []interface{}, errorlist []error) {
	for _, xpresult := range xp.result {
		for iter := xpresult.NodeIter(); iter.Next(); {
			result, err := iter.Node().Find(exp)
			if err != nil {
				errorlist = append(errorlist, err)
				if xp.errorFlags == ERROR_SKIP {
					continue
				}
				// Returning (instead of break) also leaves the outer loop.
				return values, errorlist
			}
			values = append(values, do(result)...)
		}
	}
	return values, errorlist
}
// ForEach evaluates exp relative to every node currently selected by xp and
// returns a new selection made of the successful evaluation results. The new
// selection inherits xp's error policy.
//
// The returned selection is never nil, so calls can be chained even when
// nothing matched. Every evaluation error is appended to errorlist. With
// ERROR_SKIP the failing node is skipped and iteration continues; with any
// other flag iteration stops at the first error and the results gathered so
// far are returned.
func (xp *XPath) ForEach(exp string) (newxpath *XPath, errorlist []error) {
	newxpath = &XPath{errorFlags: xp.errorFlags}
	for _, xpresult := range xp.result {
		for iter := xpresult.NodeIter(); iter.Next(); {
			result, err := iter.Node().Find(exp)
			if err != nil {
				errorlist = append(errorlist, err)
				if xp.errorFlags == ERROR_SKIP {
					// Never store the result of a failed evaluation.
					continue
				}
				// Returning (instead of break) also leaves the outer loop.
				return newxpath, errorlist
			}
			newxpath.result = append(newxpath.result, result)
		}
	}
	return newxpath, errorlist
}

36
extractor_test.go Normal file
View File

@ -0,0 +1,36 @@
package hunter
import (
"log"
"testing"
)
// AreaCode is a test task that embeds PreFile, so the page it works on is
// read from the file the PreFile value names.
type AreaCode struct {
PreFile
}
// Execute loads the task's page through cxt.Hunt, selects every
// div[@class='ip'] element and logs the text of its ./h4 children and the
// ./h5/text() nodes found under ./h4/ul//li.
//
// Any failure panics. Errors collected by ForEach are reported with the first
// collected error, not with the (nil) err left over from the earlier call.
func (a *AreaCode) Execute(cxt *TaskContext) {
	r, err := cxt.Hunt()
	if err != nil {
		panic(err)
	}

	etor := NewExtractor(r.Content())
	xp, err := etor.XPath("//div[@class='ip']")
	if err != nil {
		panic(err)
	}
	log.Println(xp.ForEachText("./h4"))

	xpli, errlist := xp.ForEach("./h4/ul//li")
	if len(errlist) != 0 {
		panic(errlist[0])
	}
	log.Println(xpli.ForEachString("./h5/text()"))
}
func TestExtractor(t *testing.T) {
ht := NewHunter()
ht.AddTask(&AreaCode{"./testfile/area.html"})
ht.Execute()
t.Error()
}

View File

@ -20,7 +20,7 @@ func (u *PreChromeUrl) Before(ctx *TaskContext) {
var service *selenium.Service
if u.service == nil {
for i := 0; i < 20; i++ {
for i := 0; i < 50; i++ {
if u.Port == 0 {
u.Port = randomdata.Number(10000, 50000)
}

28
pre_file_com.go Normal file
View File

@ -0,0 +1,28 @@
package hunter
import (
"io/ioutil"
"os"
"github.com/474420502/requests"
)
// PreFile is the file pre-processing component of a Task; its value is the
// path of the file that Hunt reads.
type PreFile string
// Hunt reads the file named by u and returns its bytes as the content of an
// HResponse; all other response fields are left at their zero values.
//
// A failure to open or read the file is returned as the error (with a nil
// response) instead of panicking, and the file is always closed.
func (u PreFile) Hunt() (requests.IResponse, error) {
	f, err := os.Open(string(u))
	if err != nil {
		return nil, err
	}
	// Release the descriptor on every path, including read errors.
	defer f.Close()

	data, err := ioutil.ReadAll(f)
	if err != nil {
		return nil, err
	}
	return &HResponse{Hcontent: data}, nil
}

48
response.go Normal file
View File

@ -0,0 +1,48 @@
package hunter
import "net/http"
// IResponse interface {
// Content() []byte
// GetStatus() string
// GetStatusCode() int
// GetHeader() http.Header
// GetCookie() []*http.Cookie
// // 返回不同的自定义的Response, 也可以是其他定义的结构体如WebDriver
// GetResponse() interface{}
// }
// HResponse is a bare response record: every field is exported so a value can
// be filled in directly, and each accessor returns the matching field
// unchanged.
type HResponse struct {
	Hcontent  []byte         // returned by Content
	Hstatus   string         // returned by GetStatus
	Hcode     int            // returned by GetStatusCode
	Hheader   http.Header    // returned by GetHeader
	Hcookies  []*http.Cookie // returned by GetCookie
	Hresponse interface{}    // returned by GetResponse
}

// Content returns the Hcontent field.
func (r *HResponse) Content() []byte { return r.Hcontent }

// GetStatus returns the Hstatus field.
func (r *HResponse) GetStatus() string { return r.Hstatus }

// GetStatusCode returns the Hcode field.
func (r *HResponse) GetStatusCode() int { return r.Hcode }

// GetHeader returns the Hheader field.
func (r *HResponse) GetHeader() http.Header { return r.Hheader }

// GetCookie returns the Hcookies field.
func (r *HResponse) GetCookie() []*http.Cookie { return r.Hcookies }

// GetResponse returns the Hresponse field.
func (r *HResponse) GetResponse() interface{} { return r.Hresponse }

5442
testfile/area.html Normal file

File diff suppressed because it is too large Load Diff