diff --git a/extractor.go b/extractor.go
new file mode 100644
index 0000000..bbf261e
--- /dev/null
+++ b/extractor.go
@@ -0,0 +1,156 @@
+package hunter
+
+import (
+	"github.com/lestrrat-go/libxml2"
+	"github.com/lestrrat-go/libxml2/clib"
+	"github.com/lestrrat-go/libxml2/types"
+)
+
+// Extractor holds the raw HTML bytes together with the document parsed from
+// them and is the entry point for XPath queries.
+type Extractor struct {
+	Content []byte
+	doc     types.Document
+}
+
+// NewExtractor parses content as HTML and returns an Extractor over it.
+// It panics when libxml2 reports a parse error.
+func NewExtractor(content []byte) *Extractor {
+	doc, err := libxml2.ParseHTML(content)
+	if err != nil {
+		panic(err)
+	}
+	return &Extractor{Content: content, doc: doc}
+}
+
+// XPath evaluates exp against the parsed document and wraps the result in an
+// XPath whose error mode is ERROR_SKIP. On failure it returns a nil *XPath and
+// the error, so a failed lookup can never be iterated by accident.
+func (etor *Extractor) XPath(exp string) (*XPath, error) {
+	result, err := etor.doc.Find(exp)
+	if err != nil {
+		return nil, err
+	}
+	return &XPath{result: []types.XPathResult{result}, errorFlags: ERROR_SKIP}, nil
+}
+
+// ErrorFlags selects how the ForEach* methods react when a sub-expression
+// fails on one of the nodes.
+type ErrorFlags int
+
+const (
+	// ERROR_SKIP records the error and moves on to the next node.
+	ERROR_SKIP ErrorFlags = 1
+	// ERROR_BREAK records the error and stops the traversal.
+	ERROR_BREAK ErrorFlags = 2
+)
+
+// XPath is a set of XPath results that further relative expressions can be
+// evaluated against.
+type XPath struct {
+	result     []types.XPathResult
+	errorFlags ErrorFlags
+}
+
+// ForEachString evaluates exp under every node held by xp and returns
+// String() of each matched node, plus the errors met on the way.
+func (xp *XPath) ForEachString(exp string) (sstr []string, errorlist []error) {
+	return xp.forEachStringOf(exp, types.Node.String)
+}
+
+// ForEachText is like ForEachString but collects TextContent() of each match.
+func (xp *XPath) ForEachText(exp string) (texts []string, errorlist []error) {
+	return xp.forEachStringOf(exp, types.Node.TextContent)
+}
+
+// ForEachType evaluates exp under every node held by xp and returns
+// NodeType() of each matched node, plus the errors met on the way.
+func (xp *XPath) ForEachType(exp string) (typelist []clib.XMLNodeType, errorlist []error) {
+	items, errlist := xp.ForEachEx(exp, func(result types.XPathResult) []interface{} {
+		var ir []interface{}
+		for iter := result.NodeIter(); iter.Next(); {
+			ir = append(ir, iter.Node().NodeType())
+		}
+		return ir
+	})
+
+	for _, item := range items {
+		typelist = append(typelist, item.(clib.XMLNodeType))
+	}
+	return typelist, errlist
+}
+
+// ForEachValue is like ForEachString but collects NodeValue() of each match.
+func (xp *XPath) ForEachValue(exp string) (values []string, errorlist []error) {
+	return xp.forEachStringOf(exp, types.Node.NodeValue)
+}
+
+// ForEachName is like ForEachString but collects NodeName() of each match.
+func (xp *XPath) ForEachName(exp string) (names []string, errorlist []error) {
+	return xp.forEachStringOf(exp, types.Node.NodeName)
+}
+
+// forEachStringOf is the shared body of the string-returning ForEach* methods:
+// it runs exp through ForEachEx and maps every matched node through pick.
+func (xp *XPath) forEachStringOf(exp string, pick func(types.Node) string) (out []string, errorlist []error) {
+	items, errlist := xp.ForEachEx(exp, func(result types.XPathResult) []interface{} {
+		var ir []interface{}
+		for iter := result.NodeIter(); iter.Next(); {
+			ir = append(ir, pick(iter.Node()))
+		}
+		return ir
+	})
+
+	for _, item := range items {
+		out = append(out, item.(string))
+	}
+	return out, errlist
+}
+
+// ForEachEx evaluates exp relative to every node held by xp and passes each
+// successful result to do, concatenating whatever do returns.
+//
+// A failed evaluation is always appended to errorlist and never reaches do.
+// Under ERROR_SKIP the traversal continues with the next node; under any
+// other mode it stops and returns what was gathered so far.
+func (xp *XPath) ForEachEx(exp string, do func(types.XPathResult) []interface{}) (values []interface{}, errorlist []error) {
+	for _, xpresult := range xp.result {
+		for iter := xpresult.NodeIter(); iter.Next(); {
+			result, err := iter.Node().Find(exp)
+			if err != nil {
+				errorlist = append(errorlist, err)
+				if xp.errorFlags == ERROR_SKIP {
+					continue
+				}
+				return values, errorlist
+			}
+			values = append(values, do(result)...)
+		}
+	}
+	return values, errorlist
+}
+
+// ForEach evaluates exp relative to every node held by xp and returns a new
+// XPath holding the successful results, so lookups can be chained. The
+// returned XPath is never nil, even when xp itself holds no results.
+//
+// Errors are handled as in ForEachEx: always recorded, and failed results are
+// never stored in the returned XPath.
+func (xp *XPath) ForEach(exp string) (newxpath *XPath, errorlist []error) {
+	newxpath = &XPath{errorFlags: xp.errorFlags}
+
+	for _, xpresult := range xp.result {
+		for iter := xpresult.NodeIter(); iter.Next(); {
+			result, err := iter.Node().Find(exp)
+			if err != nil {
+				errorlist = append(errorlist, err)
+				if xp.errorFlags == ERROR_SKIP {
+					continue
+				}
+				return newxpath, errorlist
+			}
+			newxpath.result = append(newxpath.result, result)
+		}
+	}
+	return newxpath, errorlist
+}
diff --git a/extractor_test.go b/extractor_test.go
new file mode 100644
index 0000000..7f4b63f
--- /dev/null
+++ b/extractor_test.go
@@ -0,0 +1,40 @@
+package hunter
+
+import (
+	"log"
+	"testing"
+)
+
+// AreaCode is a test task that embeds PreFile to name the file it works on.
+type AreaCode struct {
+	PreFile
+}
+
+// Execute fetches the response through cxt.Hunt, then logs the texts and
+// strings found by a few chained XPath lookups. Any failure panics.
+func (a *AreaCode) Execute(cxt *TaskContext) {
+	r, err := cxt.Hunt()
+	if err != nil {
+		panic(err)
+	}
+	etor := NewExtractor(r.Content())
+	xp, err := etor.XPath("//div[@class='ip']")
+	if err != nil {
+		panic(err)
+	}
+
+	log.Println(xp.ForEachText("./h4"))
+	xpli, errlist := xp.ForEach("./h4/ul//li")
+	if len(errlist) != 0 {
+		// Report the collected errors; err is nil at this point.
+		panic(errlist)
+	}
+	log.Println(xpli.ForEachString("./h5/text()"))
+}
+
+// TestExtractor runs the AreaCode task against the bundled HTML fixture.
+func TestExtractor(t *testing.T) {
+	ht := NewHunter()
+	ht.AddTask(&AreaCode{"./testfile/area.html"})
+	ht.Execute()
+}
diff --git a/pre_chrome_driver_com.go b/pre_chrome_driver_com.go
index 53e07f8..5fe2d46 100644
--- a/pre_chrome_driver_com.go
+++ b/pre_chrome_driver_com.go
@@ -20,7 +20,7 @@ func (u *PreChromeUrl) Before(ctx *TaskContext) {
 
 	var service *selenium.Service
 	if u.service == nil {
-		for i := 0; i < 20; i++ {
+		for i := 0; i < 50; i++ {
 			if u.Port == 0 {
 				u.Port = randomdata.Number(10000, 50000)
 			}
diff --git a/pre_file_com.go b/pre_file_com.go
new file mode 100644
index 0000000..98fa633
--- /dev/null
+++ b/pre_file_com.go
@@ -0,0 +1,29 @@
+package hunter
+
+import (
+	"io/ioutil"
+	"os"
+
+	"github.com/474420502/requests"
+)
+
+// PreFile is the file pre-processing component of a Task; its value is the
+// path of the file to load.
+type PreFile string
+
+// Hunt reads the file named by u and returns its bytes as the content of an
+// HResponse. Open and read failures are returned to the caller, and the file
+// is always closed before returning.
+func (u PreFile) Hunt() (requests.IResponse, error) {
+	f, err := os.Open(string(u))
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	data, err := ioutil.ReadAll(f)
+	if err != nil {
+		return nil, err
+	}
+	return &HResponse{Hcontent: data}, nil
+}
diff --git a/response.go b/response.go
new file mode 100644
index 0000000..4586988
--- /dev/null
+++ b/response.go
@@ -0,0 +1,56 @@
+package hunter
+
+import "net/http"
+
+// IResponse interface {
+// 	Content() []byte
+// 	GetStatus() string
+// 	GetStatusCode() int
+// 	GetHeader() http.Header
+// 	GetCookie() []*http.Cookie
+
+// 	// Returns the underlying custom response; it may also be another
+// 	// structure such as a WebDriver.
+// 	GetResponse() interface{}
+// }
+
+// HResponse is a plain response value whose exported fields can be filled in
+// directly; each accessor below returns the matching field.
+type HResponse struct {
+	Hcontent  []byte
+	Hstatus   string
+	Hcode     int
+	Hheader   http.Header
+	Hcookies  []*http.Cookie
+	Hresponse interface{}
+}
+
+// Content returns Hcontent.
+func (resp *HResponse) Content() []byte {
+	return resp.Hcontent
+}
+
+// GetStatus returns Hstatus.
+func (resp *HResponse) GetStatus() string {
+	return resp.Hstatus
+}
+
+// GetStatusCode returns Hcode.
+func (resp *HResponse) GetStatusCode() int {
+	return resp.Hcode
+}
+
+// GetHeader returns Hheader.
+func (resp *HResponse) GetHeader() http.Header {
+	return resp.Hheader
+}
+
+// GetCookie returns Hcookies.
+func (resp *HResponse) GetCookie() []*http.Cookie {
+	return resp.Hcookies
+}
+
+// GetResponse returns Hresponse.
+func (resp *HResponse) GetResponse() interface{} {
+	return resp.Hresponse
+}
diff --git a/testfile/area.html b/testfile/area.html
new file mode 100644
index 0000000..7a773c2
--- /dev/null
+++ b/testfile/area.html
@@ -0,0 +1,5442 @@
+
+
+
+
+
+