From 23fa32b4aed7de40df4c3c1f2e7ddac97a480d85 Mon Sep 17 00:00:00 2001 From: eson Date: Fri, 7 Aug 2020 18:10:22 +0800 Subject: [PATCH] =?UTF-8?q?TODO:=20=E9=87=8D=E6=9E=84XPath=20=E7=9A=84?= =?UTF-8?q?=E4=BD=BF=E7=94=A8=20shit?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config.yaml | 4 +- go.mod | 3 +- go.sum | 41 ++++- store.go | 2 +- .../twitcasting_task1/main_test.go | 140 ++++++++++++++---- 5 files changed, 153 insertions(+), 37 deletions(-) diff --git a/config.yaml b/config.yaml index 1e031f8..bf7f89d 100644 --- a/config.yaml +++ b/config.yaml @@ -1,3 +1,3 @@ database: - source_uri: "root:@tcp(127.0.0.1:4000)/intimate_source?parseTime=true&loc=Local" - extractor_uri: "root:@tcp(127.0.0.1:4000)/intimate_extractor?parseTime=true&loc=Local" \ No newline at end of file + source_uri: "root:@tcp(127.0.0.1:4000)/intimate_source?parseTime=true&loc=Local&charset=utf8mb4&collation=utf8mb4_unicode_ci" + extractor_uri: "root:@tcp(127.0.0.1:4000)/intimate_extractor?parseTime=true&loc=Local&charset=utf8mb4&collation=utf8mb4_unicode_ci" \ No newline at end of file diff --git a/go.mod b/go.mod index 0841fb6..406d3f9 100644 --- a/go.mod +++ b/go.mod @@ -3,13 +3,14 @@ module intimate go 1.14 require ( - github.com/474420502/extractor v0.2.2 + github.com/474420502/extractor v0.4.1 github.com/474420502/focus v0.12.0 github.com/474420502/gcurl v0.1.2 github.com/474420502/hunter v0.3.4 github.com/474420502/requests v1.6.0 github.com/go-sql-driver/mysql v1.5.0 github.com/lestrrat-go/libxml2 v0.0.0-20200215080510-6483566f52cb + github.com/stretchr/testify v1.6.1 // indirect github.com/tebeka/selenium v0.9.9 github.com/tidwall/gjson v1.6.0 github.com/tidwall/pretty v1.0.1 // indirect diff --git a/go.sum b/go.sum index 61ec03c..119a3b6 100644 --- a/go.sum +++ b/go.sum @@ -2,16 +2,44 @@ cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMT cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= cloud.google.com/go v0.38.0/go.mod h1:990N+gfupTy94rShfmMCWGDn0LpTmnzTp2qbd1dvSRU= cloud.google.com/go v0.41.0/go.mod h1:OauMR7DV8fzvZIl2qg6rkaIhD/vmgk4iwEw/h6ercmg= -github.com/474420502/extractor v0.2.2 h1:hGao2iZt5CEI8oqYjQW938osQdHKgNWL/bwRJQNgHTM= -github.com/474420502/extractor v0.2.2/go.mod h1:OVFijdKLDghigpIYISHzlognL5q8eeVenT2fRhCyFns= +github.com/474420502/extractor v0.3.0 h1:VURhjNFP2kG6DvPZfsRR/3JLYHURvsHazp/JazNYbME= +github.com/474420502/extractor v0.3.0/go.mod h1:thq0UAm30cMLY6+LJHPNRSw/H3ZrMGfmK0rk+HwycvE= +github.com/474420502/extractor v0.3.1 h1:IxOeJziOR3DPrZJhOcbOUzAc/UABmKUYGLdVgxSi9yk= +github.com/474420502/extractor v0.3.1/go.mod h1:thq0UAm30cMLY6+LJHPNRSw/H3ZrMGfmK0rk+HwycvE= +github.com/474420502/extractor v0.3.2 h1:KcgRC0+pNfK803uZjL76pgsfsnlKSMR1nQX6o6y8cVA= +github.com/474420502/extractor v0.3.2/go.mod h1:yQRtpUOeb37tMitCsenURnN2Yas9Jm/5HGFDCO+/20k= +github.com/474420502/extractor v0.3.3 h1:2/rCOEtTVkezGqz7E0D8KKN1QBKlQaihe+UMxNZcwNk= +github.com/474420502/extractor v0.3.3/go.mod h1:8cakB/mW3No6o2I7PtrVHQ35auIgHh0mGIfk1++UZm4= +github.com/474420502/extractor v0.3.4 h1:3lKV5oke46sDAxkiY4KGMeBiYI8hwNkiAa2Sf8B+xPY= +github.com/474420502/extractor v0.3.4/go.mod h1:+biDin5eKLuJQHNbW+HnPYCC+2LL090iCZNxQklB11Y= +github.com/474420502/extractor v0.3.5 h1:uq3SuPY51F1pYvAtnaJtcqtJ+yE7wcaq3LP9DWTtBnQ= +github.com/474420502/extractor v0.3.5/go.mod h1:pKjqYQCZquakvor/d9JJQYrTYInWKaVXjzAg+IM1/tY= +github.com/474420502/extractor v0.3.6 h1:Qsky2YYUCENz3BFzlFOOWykFyDOfigbkkCTnMAkKExE= +github.com/474420502/extractor v0.3.6/go.mod h1:rH+/kx0CS8xpzOBqraisQE1A9vfXAPZZ+091D8HYXvw= +github.com/474420502/extractor v0.3.7 h1:QDBd4mAjf6D+vH98LQ1SJByDTtLago9GDiEvN1oyDJ0= +github.com/474420502/extractor v0.3.7/go.mod h1:v0TAfUw1zNyFCYVqj5xyFVFpoqmqErvAd2SzMzR/yc8= +github.com/474420502/extractor v0.4.0 h1:h6MbrkCBPQ2/+VRAK741oVcZuDhZ2t4USt0MOIf/v2U= +github.com/474420502/extractor v0.4.0/go.mod h1:1oPuXIm7whY+/rU7hxDW3ick4hHc4AdiNqdk5vVWaXs= +github.com/474420502/extractor v0.4.1 h1:WqcwF7gyvGREBrXBAm3fLR7yqxP/P/arq/iHXZvt8Gg= +github.com/474420502/extractor v0.4.1/go.mod h1:1oPuXIm7whY+/rU7hxDW3ick4hHc4AdiNqdk5vVWaXs= github.com/474420502/focus v0.12.0 h1:+icbmj7IEOefvTegHt5EpcHt6WFbe2miIrceUJx2Evo= github.com/474420502/focus v0.12.0/go.mod h1:d0PMjtMxFz1a9HIhwyFPkWa+JF+0LgOrEUfd8iZka6s= github.com/474420502/gcurl v0.1.2 h1:ON9Yz3IgAdtDlFlHfkAJ3aIEBDxH0RiViPE5ST5ohKg= github.com/474420502/gcurl v0.1.2/go.mod h1:hws5q/Ao64bXLLDnldz9VyTQUndTWc/i5DzdEazFfoM= github.com/474420502/hunter v0.3.4 h1:fyLAgI84jWe3IcqsISC53j1w3CXI1FERxX//Potns0M= github.com/474420502/hunter v0.3.4/go.mod h1:pe4Xr/I+2agvq339vS/OZV+EiHAWtpXQs75rioSW9oA= -github.com/474420502/libxml2 v0.0.0-20200803084225-29e441d26406 h1:nLvl2D2y+hxCglLnRmLqwRGwmUsXQt8ga46zGySTU1I= -github.com/474420502/libxml2 v0.0.0-20200803084225-29e441d26406/go.mod h1:bUbcte7hFuLijGG6/+gGxurW3XvxE/CBdfAAlsIWj34= +github.com/474420502/libxml2 v0.0.0-20200806111302-aa4be92ad592 h1:kgvx2MvoMhkrzLVjM6C6RIcshgI80fnq5/LqAnTOMxQ= +github.com/474420502/libxml2 v0.0.0-20200806111302-aa4be92ad592/go.mod h1:bUbcte7hFuLijGG6/+gGxurW3XvxE/CBdfAAlsIWj34= +github.com/474420502/libxml2 v0.0.0-20200807033034-1b43ad443d1d h1:MQduBAgnOCeGVUU+tawJxQLP1/Bgnn7119hGpVb9VFI= +github.com/474420502/libxml2 v0.0.0-20200807033034-1b43ad443d1d/go.mod h1:bUbcte7hFuLijGG6/+gGxurW3XvxE/CBdfAAlsIWj34= +github.com/474420502/libxml2 v0.0.0-20200807033649-9731e0a44bf0 h1:EiO+pSoFk7TTv/TnVFCT/swjWQEeLAZ2wXeXsS+9+kY= +github.com/474420502/libxml2 v0.0.0-20200807033649-9731e0a44bf0/go.mod h1:bUbcte7hFuLijGG6/+gGxurW3XvxE/CBdfAAlsIWj34= +github.com/474420502/libxml2 v0.0.0-20200807034854-eaa2a69a2790 h1:vzHGXv0e7MX+MSZcz4SjRJUfzoUpX96Qf0f48T6dkxk= +github.com/474420502/libxml2 v0.0.0-20200807034854-eaa2a69a2790/go.mod h1:bUbcte7hFuLijGG6/+gGxurW3XvxE/CBdfAAlsIWj34= +github.com/474420502/libxml2 v0.0.0-20200807035356-cd2e51185f4b h1:q9qSCx9gm7gS6Xr2nmKqkiu2FApQJFkqvTsrAzcWXps= +github.com/474420502/libxml2 v0.0.0-20200807035356-cd2e51185f4b/go.mod h1:bUbcte7hFuLijGG6/+gGxurW3XvxE/CBdfAAlsIWj34= +github.com/474420502/libxml2 v0.0.0-20200807040518-4ef6186ae68c h1:UZriMoPoXEA4Mq/yP+36sxwkOC3Jk3nqy2I7e3ZV470= +github.com/474420502/libxml2 v0.0.0-20200807040518-4ef6186ae68c/go.mod h1:bUbcte7hFuLijGG6/+gGxurW3XvxE/CBdfAAlsIWj34= github.com/474420502/requests v1.6.0 h1:f4h4j40eT0P5whhg9LdkotD8CaKjtuDu/vz9iSUkCgY= github.com/474420502/requests v1.6.0/go.mod h1:SLXrQ5dL9c7dkIeKNUCBAjOIt3J9KFCS2RQjWJecNwo= github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ= @@ -24,6 +52,9 @@ github.com/Pallinder/go-randomdata v1.1.0 h1:gUubB1IEUliFmzjqjhf+bgkg1o6uoFIkRsP github.com/Pallinder/go-randomdata v1.1.0/go.mod h1:yHmJgulpD2Nfrm0cR9tI/+oAgRqCQQixsA8HyRZfV9Y= github.com/Pallinder/go-randomdata v1.2.0 h1:DZ41wBchNRb/0GfsePLiSwb0PHZmT67XY00lCDlaYPg= github.com/Pallinder/go-randomdata v1.2.0/go.mod h1:yHmJgulpD2Nfrm0cR9tI/+oAgRqCQQixsA8HyRZfV9Y= +github.com/antchfx/htmlquery v1.2.3/go.mod h1:B0ABL+F5irhhMWg54ymEZinzMSi0Kt3I2if0BLYa3V0= +github.com/antchfx/xpath v1.1.6/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= +github.com/antchfx/xpath v1.1.10/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= github.com/blang/semver v3.5.1+incompatible h1:cQNTCjp13qL8KC3Nbxr/y2Bqb63oX6wdnnjpJbkM4JQ= @@ -40,6 +71,7 @@ github.com/go-sql-driver/mysql v1.5.0 h1:ozyZYNQW3x3HtqT1jira07DN2PArx2v7/mN66gG github.com/go-sql-driver/mysql v1.5.0/go.mod h1:DCzpHaOWr8IXmIStZouvnhqoel9Qv2LBy8hT2VhHyBg= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b h1:VKtxabqXZkF25pY9ekfRL6a582T4P37/31XEstQ5p58= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= +github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/mock v1.3.1/go.mod h1:sBzyDLLjw3U8JLTeZvSv8jJB+tU5PVekmnlKIyFUx0Y= @@ -109,6 +141,7 @@ golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e h1:3G+cUijn7XD+S4eJFddp53Pv7+slrESplyjG25HgL+k= golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20200707034311-ab3426394381 h1:VXak5I6aEWmAXeQjA+QSZzlgNrpq9mjcfDemuexIKsU= golang.org/x/net v0.0.0-20200707034311-ab3426394381/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= diff --git a/store.go b/store.go index 0f34036..e83718f 100644 --- a/store.go +++ b/store.go @@ -346,7 +346,7 @@ func (store *StoreExtractor) InsertStreamer(streamer IGet) (isExists bool) { return true } - _, err = tx.Exec("INSERT INTO "+StreamerTable+"(platform, user_id, update_url, update_time) VALUES(?,?,?,?);", streamer.Get("Platform"), streamer.Get("UserId"), streamer.Get("UpdateUrl"), time.Now().Add(-time.Minute*60)) + _, err = tx.Exec("INSERT INTO "+StreamerTable+"(platform, user_id, update_url, tags, update_time) VALUES(?,?,?,?,?);", streamer.Get("Platform"), streamer.Get("UserId"), streamer.Get("UpdateUrl"), streamer.Get("Tags"), time.Now().Add(-time.Minute*60)) if err != nil { panic(err) } diff --git a/tasks/twitcasting/twitcasting_task1/main_test.go b/tasks/twitcasting/twitcasting_task1/main_test.go index 2408667..7acc76b 100644 --- a/tasks/twitcasting/twitcasting_task1/main_test.go +++ b/tasks/twitcasting/twitcasting_task1/main_test.go @@ -1,7 +1,14 @@ package main import ( + "database/sql" + "encoding/json" "intimate" + "net/http" + "net/url" + "os" + "os/signal" + "syscall" "time" "github.com/474420502/extractor" @@ -11,9 +18,18 @@ import ( "log" "testing" + _ "net/http/pprof" + "github.com/474420502/requests" ) +func Test(t *testing.T) { + rawurl := "https://twitcasting.tv/你好" + u, _ := url.Parse(rawurl) + t.Error(u.EscapedPath()) + t.Error(u.String()) +} + // sstore 源存储实例, 为存储源数据的实现. 表格具体参考sql/intimate_source.sql var sstore *intimate.StoreSource = intimate.NewStoreSource(string(intimate.STTwitcasting)) @@ -21,44 +37,74 @@ var sstore *intimate.StoreSource = intimate.NewStoreSource(string(intimate.STTwi var estore *intimate.StoreExtractor = intimate.NewStoreExtractor() func TestMain(t *testing.T) { + f, _ := os.OpenFile("./log", os.O_TRUNC|os.O_CREATE|os.O_RDWR, os.ModePerm) + log.SetFlags(log.Llongfile | log.Ltime) + log.SetOutput(f) + go func() { + log.Println(http.ListenAndServe(":4040", nil)) + }() + + homeurl := "https://twitcasting.tv" searchurl := "https://twitcasting.tv/rankingindex.php" queuedict := make(map[string]bool) queue := heap.New(compare.String) queue.Put(searchurl) queuedict[searchurl] = true + ses := requests.NewSession() + ses.Config().SetTimeout(15) - for surl, ok := queue.Pop(); ok; surl, ok = queue.Pop() { + var surl interface{} + var ok bool + var debugsp *SearchProfile + var content []byte - ses := requests.NewSession() - resp, err := ses.Get(surl.(string)).Execute() + defer func() { + if ierr := recover(); ierr != nil { + log.Println(surl, debugsp) + f, _ := os.OpenFile("./error.html", os.O_TRUNC|os.O_CREATE|os.O_RDWR, os.ModePerm) + f.Write(content) + f.Close() + log.Panic(ierr) + } + }() + + go func() { + signalchan := make(chan os.Signal) + signal.Notify(signalchan, syscall.SIGINT, syscall.SIGKILL, syscall.SIGQUIT, syscall.SIGTERM, syscall.SIGSTOP) + log.Println("accept stop command:", <-signalchan) + f, _ := os.OpenFile("./error.html", os.O_TRUNC|os.O_CREATE|os.O_RDWR, os.ModePerm) + f.Write(content) + f.Close() + os.Exit(1) + }() + + for surl, ok = queue.Pop(); ok; surl, ok = queue.Pop() { + u, err := url.Parse(surl.(string)) if err != nil { - panic(err) + log.Println(err) + continue } + resp, err := ses.Get(u.String()).Execute() + if err != nil { + log.Println(err) + log.Println(u.String(), surl) + continue + // log.Panic(err) + } + + content = resp.Content() etor := extractor.ExtractXml(resp.Content()) - - // doc, err := libxml2.ParseHTML(resp.Content()) - // if err != nil { - // panic(err) - // } - // defer doc.Free() - - result, err := etor.XPath("//*[contains(@class, 'tag')]/@href") + result, err := etor.XPath("//p[@class='taglist']/a[contains(@class, 'tag')]/@href") if err != nil { panic(err) } - // result, err := doc.Find("//*[contains(@class, 'tag')]/@href") - // if err != nil { - // panic(err) - // } - // defer result.Free() - iter := result.NodeIter() for iter.Next() { - wurl := "https://twitcasting.tv" + iter.Node().NodeValue() + wurl := homeurl + iter.Node().NodeValue() if ok := queuedict[wurl]; !ok { log.Println(wurl) sl := &intimate.StreamerList{} @@ -67,7 +113,9 @@ func TestMain(t *testing.T) { sl.Operator = 0 sl.UpdateInterval = 120 sl.UpdateTime = time.Now() + estore.InsertStreamerList(sl) + queue.Put(wurl) queuedict[wurl] = true } @@ -80,21 +128,53 @@ func TestMain(t *testing.T) { continue } - // xps.ForEachTag(SearchProfile{}) - - // texts, errs := xps.ForEachText(".//span[@class='username']") - // if len(errs) > 0 { - // t.Error(errs) - // } + log.Println("extract tag") var splist = xps.ForEachTag(SearchProfile{}) + log.Println("finish extract tag") for _, isp := range splist { sp := isp.(*SearchProfile) + if sp.LiveUrl == "" { + continue + } + sp.UserId = sp.LiveUrl[1:] + for i := 0; i < len(sp.TagUrl); i++ { + wurl := homeurl + sp.TagUrl[i] + sp.TagUrl[i] = wurl + if ok := queuedict[wurl]; !ok { + sl := &intimate.StreamerList{} + sl.Platform = intimate.Ptwitcasting + sl.Url = wurl + sl.Operator = 0 + sl.UpdateInterval = 120 + sl.UpdateTime = time.Now() + estore.InsertStreamerList(sl) + + queue.Put(wurl) + queuedict[wurl] = true + } + } // log.Println(sp.(SearchProfile)) } + log.Println("find user:", len(splist)) for _, isp := range splist { - log.Println(isp.(*SearchProfile)) + sp := isp.(*SearchProfile) + // log.Println(sp) + streamer := &intimate.Streamer{} + streamer.Platform = intimate.Ptwitcasting + streamer.LiveUrl = sql.NullString{String: sp.LiveUrl, Valid: true} + if btags, err := json.Marshal(sp.Tag); err != nil { + log.Println(err) + } else { + streamer.Tags = btags + } + streamer.UpdateInterval = 120 + streamer.UpdateTime = sql.NullTime{Time: time.Now(), Valid: true} + streamer.UserName = sql.NullString{String: sp.UserName, Valid: true} + streamer.UserId = sp.UserId + debugsp = sp + estore.InsertStreamer(streamer) } log.Println("finish remain", queue.Size()) @@ -102,7 +182,9 @@ func TestMain(t *testing.T) { } type SearchProfile struct { - UserName string `exp:".//span[@class='username']" method:"Text"` - UserId string // `exp:".//span[@class='fullname']" method:"Text"` - LiveUrl string `exp:".//div[@class='usertext']/a[@href]" method:"Attribute,href Value"` + UserName string `exp:".//span[@class='username']" method:"Text"` + UserId string // `exp:".//span[@class='fullname']" method:"Text"` + LiveUrl string `exp:".//div[@class='usertext']/a[@href]" method:"Attribute,href Value"` + Tag []string `exp:".//a[contains(@class, 'tag tag-mini')]" method:"Text"` + TagUrl []string `exp:".//a[contains(@class, 'tag tag-mini')]" method:"Attribute,href Value"` }