diff --git a/go.mod b/go.mod index 773ad5e..469799b 100644 --- a/go.mod +++ b/go.mod @@ -6,5 +6,8 @@ require ( github.com/474420502/focus v0.8.1 github.com/474420502/gurl v0.0.2 github.com/474420502/requests v1.5.0 + github.com/Pallinder/go-randomdata v1.1.0 + github.com/lestrrat-go/libxml2 v0.0.0-20200215080510-6483566f52cb + github.com/pkg/errors v0.9.1 // indirect github.com/tebeka/selenium v0.9.9 ) diff --git a/go.sum b/go.sum index bca4755..630bde3 100644 --- a/go.sum +++ b/go.sum @@ -15,6 +15,7 @@ github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802 h1:1BDTz0u9nC3//pOC github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= github.com/BurntSushi/xgbutil v0.0.0-20160919175755-f7c97cef3b4e h1:4ZrkT/RzpnROylmoQL57iVUL57wGKTR5O6KpVnbm2tA= github.com/BurntSushi/xgbutil v0.0.0-20160919175755-f7c97cef3b4e/go.mod h1:uw9h2sd4WWHOPdJ13MQpwK5qYWKYDumDqxWWIknEQ+k= +github.com/Pallinder/go-randomdata v1.1.0 h1:gUubB1IEUliFmzjqjhf+bgkg1o6uoFIkRsP3VrhEcx8= github.com/Pallinder/go-randomdata v1.1.0/go.mod h1:yHmJgulpD2Nfrm0cR9tI/+oAgRqCQQixsA8HyRZfV9Y= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= @@ -50,6 +51,10 @@ github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5m github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU= +github.com/lestrrat-go/libxml2 v0.0.0-20200215080510-6483566f52cb h1:qqNmX9V9n4byPp7LUvUf7CPhMPYO9ol4ElpbD3DgzuY= +github.com/lestrrat-go/libxml2 v0.0.0-20200215080510-6483566f52cb/go.mod h1:fy/ZVbgyB83mtricxwSW3zqIRXWOVpKG2PvdUDFeC58= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/rogpeppe/go-charset v0.0.0-20180617210344-2471d30d28b4/go.mod h1:qgYeAmZ5ZIpBWTGllZSQnw97Dj+woV0toclVaRGI8pc= diff --git a/hunter.go b/hunter.go index b831787..e7d6d97 100644 --- a/hunter.go +++ b/hunter.go @@ -1,6 +1,7 @@ package hunter import ( + "log" "strconv" pqueue "github.com/474420502/focus/priority_queue" @@ -50,13 +51,6 @@ func NewPriorityHunter(queueCreator func() *pqueue.PriorityQueue) *Hunter { hunter := &Hunter{} hunter.createQueue = queueCreator - // hunter.task = &BaseTask{} - // hunter.task.SetParent(nil) - // hunter.task.SetChildren(hunter.createQueue()) - - // hunter.cxt = NewContext() - // hunter.cxt.curNode = hunter.task - hunter.share = make(map[string]interface{}) return hunter } @@ -156,10 +150,22 @@ func (hunter *Hunter) recursionTasks(cxt *TaskContext) { } // Stop 停止任务 -func (hunter *Hunter) Stop() { +func (hunter *Hunter) close(itask ITask) { + if iclose, ok := itask.(IClose); ok { + if err := iclose.Close(); err != nil { + log.Println(err) + } + } } +// Stop 停止任务 +func (hunter *Hunter) Stop() { + for _, task := range hunter.tasks { + hunter.close(task) + } +} + // AddTask 执行任务 func (hunter *Hunter) AddTask(task ITask) { hunter.tasks = append(hunter.tasks, task) diff --git a/hunter_test.go b/hunter_test.go index 3fc4c26..e4536a9 100644 --- a/hunter_test.go +++ b/hunter_test.go @@ -7,7 +7,7 @@ import ( ) func init() { - log.Println("测试最好使用 docker run -p 80:80 kennethreitz/httpbin") + log.Println("recommend: docker run -p 80:80 kennethreitz/httpbin") } type WebGet struct { diff --git a/pre_base_driver_com.go b/pre_base_driver_com.go new file mode 100644 index 0000000..2540569 --- /dev/null +++ b/pre_base_driver_com.go @@ -0,0 +1,81 @@ +package hunter + +import ( + "log" + "net/http" + + "github.com/474420502/requests" + "github.com/tebeka/selenium" +) + +// https://github.com/tebeka/selenium + +// PreBaseDriverUrl Task的 curl bash 预处理组件 +type PreBaseDriverUrl struct { + PreUrl string + Port int + service *selenium.Service + driver selenium.WebDriver +} + +// Close 如果需要在最后执行销毁操作, 继承覆盖该方法 +func (u *PreBaseDriverUrl) Close() error { + + if u.service != nil { + // 直接退出, 所有销毁 直接忽略webdriver.Quit(). // Delete Session + if err := u.service.Stop(); err != nil { + return err + } + } + return nil +} + +// IResponse interface { +// Content() []byte +// GetStatus() string +// GetStatusCode() int +// GetHeader() http.Header +// GetCookie() []*http.Cookie + +// // 返回不同的自定义的Response, 也可以是其他定义的结构体如WebDriver +// GetResponse() interface{} +// } + +// Content 内容 +func (u *PreBaseDriverUrl) Content() []byte { + content, err := u.driver.PageSource() + if err != nil { + log.Println(err) + } + return []byte(content) +} + +// GetStatusCode 暂时为空 +func (u *PreBaseDriverUrl) GetStatusCode() int { + return 0 +} + +// GetStatus 内容 暂时为空 +func (u *PreBaseDriverUrl) GetStatus() string { + return "" +} + +// GetHeader 暂时为空 +func (u *PreBaseDriverUrl) GetHeader() http.Header { + return nil +} + +// GetCookie 暂时为空 +func (u *PreBaseDriverUrl) GetCookie() []*http.Cookie { + return nil +} + +// GetResponse 返回 webdriver +func (u *PreBaseDriverUrl) GetResponse() interface{} { + return u.driver +} + +func (u *PreBaseDriverUrl) Hunt() (requests.IResponse, error) { + err := u.driver.Get(string(u.PreUrl)) + return u, err +} diff --git a/pre_chrome_driver_com.go b/pre_chrome_driver_com.go new file mode 100644 index 0000000..53e07f8 --- /dev/null +++ b/pre_chrome_driver_com.go @@ -0,0 +1,46 @@ +package hunter + +import ( + "fmt" + "log" + + "github.com/Pallinder/go-randomdata" + "github.com/tebeka/selenium" +) + +// PreChromeUrl Chrome的url预处理 +type PreChromeUrl struct { + PreBaseDriverUrl +} + +// Before 驱动的预处理 +func (u *PreChromeUrl) Before(ctx *TaskContext) { + + var err error + var service *selenium.Service + + if u.service == nil { + for i := 0; i < 20; i++ { + if u.Port == 0 { + u.Port = randomdata.Number(10000, 50000) + } + service, err = selenium.NewChromeDriverService("chromedriver", u.Port) + if err != nil { + log.Println(i, err) + } else { + break + } + } + + u.service = service + } + + if u.driver == nil { + caps := selenium.Capabilities{"browserName": "chrome"} + wd, err := selenium.NewRemote(caps, fmt.Sprintf("http://localhost:%d/wd/hub", u.Port)) + if err != nil { + panic(err) + } + u.driver = wd + } +} diff --git a/pre_driver_com.go b/pre_driver_com.go deleted file mode 100644 index 967094e..0000000 --- a/pre_driver_com.go +++ /dev/null @@ -1,61 +0,0 @@ -package hunter - -import ( - "fmt" - "log" - "runtime" - - "github.com/474420502/requests" - "github.com/tebeka/selenium" -) - -// https://github.com/tebeka/selenium - -// PreDriverUrl Task的 curl bash 预处理组件 -type PreDriverUrl struct { - url string - service *selenium.Service - driver selenium.WebDriver -} - -func (u *PreDriverUrl) Before(ctx *TaskContext) { - service, err := selenium.NewChromeDriverService("chromedriver", 1030) - if err != nil { - log.Panic(err) - } - if err != nil { - panic(err) // panic is used only as an example and is not otherwise recommended. - } - u.service = service - - caps := selenium.Capabilities{"browserName": "chrome"} - wd, err := selenium.NewRemote(caps, fmt.Sprintf("http://localhost:%d/wd/hub", 1030)) - if err != nil { - panic(err) - } - u.driver = wd - - runtime.SetFinalizer(&[]interface{}{service, wd}, func(obj interface{}) { - iobj := obj.([]interface{}) - service := iobj[0].(*selenium.Service) - service.Stop() - - wd := iobj[1].(selenium.WebDriver) - wd.Quit() - }) - - err = wd.Get(string(u.url)) - if err != nil { - panic(err) - } - - ele, err := wd.FindElement(selenium.ByXPATH, "//title") - log.Println(ele.Text()) - log.Println(ele.TagName()) -} - -func (u *PreDriverUrl) Hunt() (requests.IResponse, error) { - err := u.driver.Get(string(u.url)) - - return nil, err -} diff --git a/pre_driver_com_test.go b/pre_driver_com_test.go index 6b04112..37096be 100644 --- a/pre_driver_com_test.go +++ b/pre_driver_com_test.go @@ -1,9 +1,15 @@ package hunter -import "testing" +import ( + "log" + "testing" + + "github.com/lestrrat-go/libxml2" + "github.com/tebeka/selenium" +) type WebPreDriverUrl struct { - PreDriverUrl + PreChromeUrl } func (web *WebPreDriverUrl) Execute(cxt *TaskContext) { @@ -12,9 +18,51 @@ func (web *WebPreDriverUrl) Execute(cxt *TaskContext) { panic(err) } cxt.SetShare("test", resp.Content()) + wd := resp.GetResponse().(selenium.WebDriver) + ele, err := wd.FindElement(selenium.ByXPATH, "//title") + if err != nil { + log.Panic(err) + } + title, err := ele.GetAttribute("text") + if err != nil { + log.Panic(err) + } + cxt.SetShare("driver-title", title) } func TestDriver(t *testing.T) { - hunter := NewHunter(&WebPreDriverUrl{PreDriverUrl("http://httpbin.org")}) // first params PreCurlUrl + preurl := &WebPreDriverUrl{} + preurl.PreUrl = "http://httpbin.org" + + hunter := NewHunter(preurl) // first params PreCurlUrl hunter.Execute() + defer hunter.Stop() + + content := hunter.GetShare("test").([]byte) + if content != nil { + doc, err := libxml2.ParseHTML(content) + if err != nil { + t.Error(err) + } else { + if result, err := doc.Find("//title"); err == nil { + iter := result.NodeIter() + if iter.Next() { + n := iter.Node() + if n.TextContent() != "httpbin.org" { + t.Error(n.TextContent()) + } + } else { + t.Error("can't xpath title") + } + } else { + t.Error(err) + } + } + + } + + title := hunter.GetShare("driver-title").(string) + if title != "httpbin.org" { + t.Error(title) + } } diff --git a/task.go b/task.go index 3dd54a3..cb139b5 100644 --- a/task.go +++ b/task.go @@ -29,6 +29,11 @@ type IIdentity interface { GetID() string } +// IClose 关闭 +type IClose interface { + Close() error +} + // ITaskNode 任务节点 type ITaskNode interface { Parent() ITaskNode