TODO: 重构XPath 的使用 shit

This commit is contained in:
eson 2020-08-07 18:10:22 +08:00
parent 0bff7169ec
commit 23fa32b4ae
5 changed files with 153 additions and 37 deletions

View File

@ -1,3 +1,3 @@
database:
source_uri: "root:@tcp(127.0.0.1:4000)/intimate_source?parseTime=true&loc=Local"
extractor_uri: "root:@tcp(127.0.0.1:4000)/intimate_extractor?parseTime=true&loc=Local"
source_uri: "root:@tcp(127.0.0.1:4000)/intimate_source?parseTime=true&loc=Local&charset=utf8mb4&collation=utf8mb4_unicode_ci"
extractor_uri: "root:@tcp(127.0.0.1:4000)/intimate_extractor?parseTime=true&loc=Local&charset=utf8mb4&collation=utf8mb4_unicode_ci"

3
go.mod
View File

@ -3,13 +3,14 @@ module intimate
go 1.14
require (
github.com/474420502/extractor v0.2.2
github.com/474420502/extractor v0.4.1
github.com/474420502/focus v0.12.0
github.com/474420502/gcurl v0.1.2
github.com/474420502/hunter v0.3.4
github.com/474420502/requests v1.6.0
github.com/go-sql-driver/mysql v1.5.0
github.com/lestrrat-go/libxml2 v0.0.0-20200215080510-6483566f52cb
github.com/stretchr/testify v1.6.1 // indirect
github.com/tebeka/selenium v0.9.9
github.com/tidwall/gjson v1.6.0
github.com/tidwall/pretty v1.0.1 // indirect

41
go.sum
View File

@ -2,16 +2,44 @@ cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMT
cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
cloud.google.com/go v0.38.0/go.mod h1:990N+gfupTy94rShfmMCWGDn0LpTmnzTp2qbd1dvSRU=
cloud.google.com/go v0.41.0/go.mod h1:OauMR7DV8fzvZIl2qg6rkaIhD/vmgk4iwEw/h6ercmg=
github.com/474420502/extractor v0.2.2 h1:hGao2iZt5CEI8oqYjQW938osQdHKgNWL/bwRJQNgHTM=
github.com/474420502/extractor v0.2.2/go.mod h1:OVFijdKLDghigpIYISHzlognL5q8eeVenT2fRhCyFns=
github.com/474420502/extractor v0.3.0 h1:VURhjNFP2kG6DvPZfsRR/3JLYHURvsHazp/JazNYbME=
github.com/474420502/extractor v0.3.0/go.mod h1:thq0UAm30cMLY6+LJHPNRSw/H3ZrMGfmK0rk+HwycvE=
github.com/474420502/extractor v0.3.1 h1:IxOeJziOR3DPrZJhOcbOUzAc/UABmKUYGLdVgxSi9yk=
github.com/474420502/extractor v0.3.1/go.mod h1:thq0UAm30cMLY6+LJHPNRSw/H3ZrMGfmK0rk+HwycvE=
github.com/474420502/extractor v0.3.2 h1:KcgRC0+pNfK803uZjL76pgsfsnlKSMR1nQX6o6y8cVA=
github.com/474420502/extractor v0.3.2/go.mod h1:yQRtpUOeb37tMitCsenURnN2Yas9Jm/5HGFDCO+/20k=
github.com/474420502/extractor v0.3.3 h1:2/rCOEtTVkezGqz7E0D8KKN1QBKlQaihe+UMxNZcwNk=
github.com/474420502/extractor v0.3.3/go.mod h1:8cakB/mW3No6o2I7PtrVHQ35auIgHh0mGIfk1++UZm4=
github.com/474420502/extractor v0.3.4 h1:3lKV5oke46sDAxkiY4KGMeBiYI8hwNkiAa2Sf8B+xPY=
github.com/474420502/extractor v0.3.4/go.mod h1:+biDin5eKLuJQHNbW+HnPYCC+2LL090iCZNxQklB11Y=
github.com/474420502/extractor v0.3.5 h1:uq3SuPY51F1pYvAtnaJtcqtJ+yE7wcaq3LP9DWTtBnQ=
github.com/474420502/extractor v0.3.5/go.mod h1:pKjqYQCZquakvor/d9JJQYrTYInWKaVXjzAg+IM1/tY=
github.com/474420502/extractor v0.3.6 h1:Qsky2YYUCENz3BFzlFOOWykFyDOfigbkkCTnMAkKExE=
github.com/474420502/extractor v0.3.6/go.mod h1:rH+/kx0CS8xpzOBqraisQE1A9vfXAPZZ+091D8HYXvw=
github.com/474420502/extractor v0.3.7 h1:QDBd4mAjf6D+vH98LQ1SJByDTtLago9GDiEvN1oyDJ0=
github.com/474420502/extractor v0.3.7/go.mod h1:v0TAfUw1zNyFCYVqj5xyFVFpoqmqErvAd2SzMzR/yc8=
github.com/474420502/extractor v0.4.0 h1:h6MbrkCBPQ2/+VRAK741oVcZuDhZ2t4USt0MOIf/v2U=
github.com/474420502/extractor v0.4.0/go.mod h1:1oPuXIm7whY+/rU7hxDW3ick4hHc4AdiNqdk5vVWaXs=
github.com/474420502/extractor v0.4.1 h1:WqcwF7gyvGREBrXBAm3fLR7yqxP/P/arq/iHXZvt8Gg=
github.com/474420502/extractor v0.4.1/go.mod h1:1oPuXIm7whY+/rU7hxDW3ick4hHc4AdiNqdk5vVWaXs=
github.com/474420502/focus v0.12.0 h1:+icbmj7IEOefvTegHt5EpcHt6WFbe2miIrceUJx2Evo=
github.com/474420502/focus v0.12.0/go.mod h1:d0PMjtMxFz1a9HIhwyFPkWa+JF+0LgOrEUfd8iZka6s=
github.com/474420502/gcurl v0.1.2 h1:ON9Yz3IgAdtDlFlHfkAJ3aIEBDxH0RiViPE5ST5ohKg=
github.com/474420502/gcurl v0.1.2/go.mod h1:hws5q/Ao64bXLLDnldz9VyTQUndTWc/i5DzdEazFfoM=
github.com/474420502/hunter v0.3.4 h1:fyLAgI84jWe3IcqsISC53j1w3CXI1FERxX//Potns0M=
github.com/474420502/hunter v0.3.4/go.mod h1:pe4Xr/I+2agvq339vS/OZV+EiHAWtpXQs75rioSW9oA=
github.com/474420502/libxml2 v0.0.0-20200803084225-29e441d26406 h1:nLvl2D2y+hxCglLnRmLqwRGwmUsXQt8ga46zGySTU1I=
github.com/474420502/libxml2 v0.0.0-20200803084225-29e441d26406/go.mod h1:bUbcte7hFuLijGG6/+gGxurW3XvxE/CBdfAAlsIWj34=
github.com/474420502/libxml2 v0.0.0-20200806111302-aa4be92ad592 h1:kgvx2MvoMhkrzLVjM6C6RIcshgI80fnq5/LqAnTOMxQ=
github.com/474420502/libxml2 v0.0.0-20200806111302-aa4be92ad592/go.mod h1:bUbcte7hFuLijGG6/+gGxurW3XvxE/CBdfAAlsIWj34=
github.com/474420502/libxml2 v0.0.0-20200807033034-1b43ad443d1d h1:MQduBAgnOCeGVUU+tawJxQLP1/Bgnn7119hGpVb9VFI=
github.com/474420502/libxml2 v0.0.0-20200807033034-1b43ad443d1d/go.mod h1:bUbcte7hFuLijGG6/+gGxurW3XvxE/CBdfAAlsIWj34=
github.com/474420502/libxml2 v0.0.0-20200807033649-9731e0a44bf0 h1:EiO+pSoFk7TTv/TnVFCT/swjWQEeLAZ2wXeXsS+9+kY=
github.com/474420502/libxml2 v0.0.0-20200807033649-9731e0a44bf0/go.mod h1:bUbcte7hFuLijGG6/+gGxurW3XvxE/CBdfAAlsIWj34=
github.com/474420502/libxml2 v0.0.0-20200807034854-eaa2a69a2790 h1:vzHGXv0e7MX+MSZcz4SjRJUfzoUpX96Qf0f48T6dkxk=
github.com/474420502/libxml2 v0.0.0-20200807034854-eaa2a69a2790/go.mod h1:bUbcte7hFuLijGG6/+gGxurW3XvxE/CBdfAAlsIWj34=
github.com/474420502/libxml2 v0.0.0-20200807035356-cd2e51185f4b h1:q9qSCx9gm7gS6Xr2nmKqkiu2FApQJFkqvTsrAzcWXps=
github.com/474420502/libxml2 v0.0.0-20200807035356-cd2e51185f4b/go.mod h1:bUbcte7hFuLijGG6/+gGxurW3XvxE/CBdfAAlsIWj34=
github.com/474420502/libxml2 v0.0.0-20200807040518-4ef6186ae68c h1:UZriMoPoXEA4Mq/yP+36sxwkOC3Jk3nqy2I7e3ZV470=
github.com/474420502/libxml2 v0.0.0-20200807040518-4ef6186ae68c/go.mod h1:bUbcte7hFuLijGG6/+gGxurW3XvxE/CBdfAAlsIWj34=
github.com/474420502/requests v1.6.0 h1:f4h4j40eT0P5whhg9LdkotD8CaKjtuDu/vz9iSUkCgY=
github.com/474420502/requests v1.6.0/go.mod h1:SLXrQ5dL9c7dkIeKNUCBAjOIt3J9KFCS2RQjWJecNwo=
github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ=
@ -24,6 +52,9 @@ github.com/Pallinder/go-randomdata v1.1.0 h1:gUubB1IEUliFmzjqjhf+bgkg1o6uoFIkRsP
github.com/Pallinder/go-randomdata v1.1.0/go.mod h1:yHmJgulpD2Nfrm0cR9tI/+oAgRqCQQixsA8HyRZfV9Y=
github.com/Pallinder/go-randomdata v1.2.0 h1:DZ41wBchNRb/0GfsePLiSwb0PHZmT67XY00lCDlaYPg=
github.com/Pallinder/go-randomdata v1.2.0/go.mod h1:yHmJgulpD2Nfrm0cR9tI/+oAgRqCQQixsA8HyRZfV9Y=
github.com/antchfx/htmlquery v1.2.3/go.mod h1:B0ABL+F5irhhMWg54ymEZinzMSi0Kt3I2if0BLYa3V0=
github.com/antchfx/xpath v1.1.6/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk=
github.com/antchfx/xpath v1.1.10/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk=
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio=
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs=
github.com/blang/semver v3.5.1+incompatible h1:cQNTCjp13qL8KC3Nbxr/y2Bqb63oX6wdnnjpJbkM4JQ=
@ -40,6 +71,7 @@ github.com/go-sql-driver/mysql v1.5.0 h1:ozyZYNQW3x3HtqT1jira07DN2PArx2v7/mN66gG
github.com/go-sql-driver/mysql v1.5.0/go.mod h1:DCzpHaOWr8IXmIStZouvnhqoel9Qv2LBy8hT2VhHyBg=
github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b h1:VKtxabqXZkF25pY9ekfRL6a582T4P37/31XEstQ5p58=
github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q=
github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A=
github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A=
github.com/golang/mock v1.3.1/go.mod h1:sBzyDLLjw3U8JLTeZvSv8jJB+tU5PVekmnlKIyFUx0Y=
@ -109,6 +141,7 @@ golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e h1:3G+cUijn7XD+S4eJFddp53Pv7+slrESplyjG25HgL+k=
golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
golang.org/x/net v0.0.0-20200707034311-ab3426394381 h1:VXak5I6aEWmAXeQjA+QSZzlgNrpq9mjcfDemuexIKsU=
golang.org/x/net v0.0.0-20200707034311-ab3426394381/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=

View File

@ -346,7 +346,7 @@ func (store *StoreExtractor) InsertStreamer(streamer IGet) (isExists bool) {
return true
}
_, err = tx.Exec("INSERT INTO "+StreamerTable+"(platform, user_id, update_url, update_time) VALUES(?,?,?,?);", streamer.Get("Platform"), streamer.Get("UserId"), streamer.Get("UpdateUrl"), time.Now().Add(-time.Minute*60))
_, err = tx.Exec("INSERT INTO "+StreamerTable+"(platform, user_id, update_url, tags, update_time) VALUES(?,?,?,?,?);", streamer.Get("Platform"), streamer.Get("UserId"), streamer.Get("UpdateUrl"), streamer.Get("Tags"), time.Now().Add(-time.Minute*60))
if err != nil {
panic(err)
}

View File

@ -1,7 +1,14 @@
package main
import (
"database/sql"
"encoding/json"
"intimate"
"net/http"
"net/url"
"os"
"os/signal"
"syscall"
"time"
"github.com/474420502/extractor"
@ -11,9 +18,18 @@ import (
"log"
"testing"
_ "net/http/pprof"
"github.com/474420502/requests"
)
// Test is a scratch check of how net/url serializes a URL whose path contains
// non-ASCII characters. It parses a sample address and logs the escaped path
// and the re-serialized URL so they can be inspected with `go test -v`.
func Test(t *testing.T) {
	rawurl := "https://twitcasting.tv/你好"
	u, err := url.Parse(rawurl)
	if err != nil {
		// A parse failure is the only real failure condition here.
		t.Fatalf("parse %q: %v", rawurl, err)
	}
	// Log rather than Error: the values are informational, and reporting them
	// through Error marked the test as failed on every run.
	t.Log(u.EscapedPath())
	t.Log(u.String())
}
// sstore 源存储实例, 为存储源数据的实现. 表格具体参考sql/intimate_source.sql
var sstore *intimate.StoreSource = intimate.NewStoreSource(string(intimate.STTwitcasting))
@ -21,44 +37,74 @@ var sstore *intimate.StoreSource = intimate.NewStoreSource(string(intimate.STTwi
var estore *intimate.StoreExtractor = intimate.NewStoreExtractor()
func TestMain(t *testing.T) {
f, _ := os.OpenFile("./log", os.O_TRUNC|os.O_CREATE|os.O_RDWR, os.ModePerm)
log.SetFlags(log.Llongfile | log.Ltime)
log.SetOutput(f)
go func() {
log.Println(http.ListenAndServe(":4040", nil))
}()
homeurl := "https://twitcasting.tv"
searchurl := "https://twitcasting.tv/rankingindex.php"
queuedict := make(map[string]bool)
queue := heap.New(compare.String)
queue.Put(searchurl)
queuedict[searchurl] = true
ses := requests.NewSession()
ses.Config().SetTimeout(15)
for surl, ok := queue.Pop(); ok; surl, ok = queue.Pop() {
var surl interface{}
var ok bool
var debugsp *SearchProfile
var content []byte
ses := requests.NewSession()
resp, err := ses.Get(surl.(string)).Execute()
defer func() {
if ierr := recover(); ierr != nil {
log.Println(surl, debugsp)
f, _ := os.OpenFile("./error.html", os.O_TRUNC|os.O_CREATE|os.O_RDWR, os.ModePerm)
f.Write(content)
f.Close()
log.Panic(ierr)
}
}()
go func() {
signalchan := make(chan os.Signal)
signal.Notify(signalchan, syscall.SIGINT, syscall.SIGKILL, syscall.SIGQUIT, syscall.SIGTERM, syscall.SIGSTOP)
log.Println("accept stop command:", <-signalchan)
f, _ := os.OpenFile("./error.html", os.O_TRUNC|os.O_CREATE|os.O_RDWR, os.ModePerm)
f.Write(content)
f.Close()
os.Exit(1)
}()
for surl, ok = queue.Pop(); ok; surl, ok = queue.Pop() {
u, err := url.Parse(surl.(string))
if err != nil {
panic(err)
log.Println(err)
continue
}
resp, err := ses.Get(u.String()).Execute()
if err != nil {
log.Println(err)
log.Println(u.String(), surl)
continue
// log.Panic(err)
}
content = resp.Content()
etor := extractor.ExtractXml(resp.Content())
// doc, err := libxml2.ParseHTML(resp.Content())
// if err != nil {
// panic(err)
// }
// defer doc.Free()
result, err := etor.XPath("//*[contains(@class, 'tag')]/@href")
result, err := etor.XPath("//p[@class='taglist']/a[contains(@class, 'tag')]/@href")
if err != nil {
panic(err)
}
// result, err := doc.Find("//*[contains(@class, 'tag')]/@href")
// if err != nil {
// panic(err)
// }
// defer result.Free()
iter := result.NodeIter()
for iter.Next() {
wurl := "https://twitcasting.tv" + iter.Node().NodeValue()
wurl := homeurl + iter.Node().NodeValue()
if ok := queuedict[wurl]; !ok {
log.Println(wurl)
sl := &intimate.StreamerList{}
@ -67,7 +113,9 @@ func TestMain(t *testing.T) {
sl.Operator = 0
sl.UpdateInterval = 120
sl.UpdateTime = time.Now()
estore.InsertStreamerList(sl)
queue.Put(wurl)
queuedict[wurl] = true
}
@ -80,21 +128,53 @@ func TestMain(t *testing.T) {
continue
}
// xps.ForEachTag(SearchProfile{})
// texts, errs := xps.ForEachText(".//span[@class='username']")
// if len(errs) > 0 {
// t.Error(errs)
// }
log.Println("extract tag")
var splist = xps.ForEachTag(SearchProfile{})
log.Println("finish extract tag")
for _, isp := range splist {
sp := isp.(*SearchProfile)
if sp.LiveUrl == "" {
continue
}
sp.UserId = sp.LiveUrl[1:]
for i := 0; i < len(sp.TagUrl); i++ {
wurl := homeurl + sp.TagUrl[i]
sp.TagUrl[i] = wurl
if ok := queuedict[wurl]; !ok {
sl := &intimate.StreamerList{}
sl.Platform = intimate.Ptwitcasting
sl.Url = wurl
sl.Operator = 0
sl.UpdateInterval = 120
sl.UpdateTime = time.Now()
estore.InsertStreamerList(sl)
queue.Put(wurl)
queuedict[wurl] = true
}
}
// log.Println(sp.(SearchProfile))
}
log.Println("find user:", len(splist))
for _, isp := range splist {
log.Println(isp.(*SearchProfile))
sp := isp.(*SearchProfile)
// log.Println(sp)
streamer := &intimate.Streamer{}
streamer.Platform = intimate.Ptwitcasting
streamer.LiveUrl = sql.NullString{String: sp.LiveUrl, Valid: true}
if btags, err := json.Marshal(sp.Tag); err != nil {
log.Println(err)
} else {
streamer.Tags = btags
}
streamer.UpdateInterval = 120
streamer.UpdateTime = sql.NullTime{Time: time.Now(), Valid: true}
streamer.UserName = sql.NullString{String: sp.UserName, Valid: true}
streamer.UserId = sp.UserId
debugsp = sp
estore.InsertStreamer(streamer)
}
log.Println("finish remain", queue.Size())
@ -102,7 +182,9 @@ func TestMain(t *testing.T) {
}
// SearchProfile holds the values extracted for one user entry on a listing
// page. TestMain passes SearchProfile{} to xps.ForEachTag and reads the
// results back as *SearchProfile values.
//
// Each `exp` tag is a relative XPath expression (they all start with `.//`)
// and `method` names how the value is read from the match; presumably both
// are interpreted by the extractor package — confirm against its docs.
//
// NOTE(review): the UserName/UserId/LiveUrl lines appear twice in this view,
// which looks like the old and new side of the same diff hunk rather than
// real duplicate fields — confirm against the checked-in file.
type SearchProfile struct {
UserName string `exp:".//span[@class='username']" method:"Text"`
UserId string // `exp:".//span[@class='fullname']" method:"Text"`
LiveUrl string `exp:".//div[@class='usertext']/a[@href]" method:"Attribute,href Value"`
// UserName is the text of the span with class "username".
UserName string `exp:".//span[@class='username']" method:"Text"`
// UserId has no active extraction tag (it is commented out); TestMain sets
// it to LiveUrl with the first character dropped.
UserId string // `exp:".//span[@class='fullname']" method:"Text"`
// LiveUrl is the href of the link inside the "usertext" div. TestMain skips
// entries where it is empty.
LiveUrl string `exp:".//div[@class='usertext']/a[@href]" method:"Attribute,href Value"`
// Tag collects the text of each tag link; TestMain JSON-encodes it into the
// streamer's Tags field.
Tag []string `exp:".//a[contains(@class, 'tag tag-mini')]" method:"Text"`
// TagUrl collects the href of each tag link; TestMain prefixes every entry
// with the site home URL before queueing it.
TagUrl []string `exp:".//a[contains(@class, 'tag tag-mini')]" method:"Attribute,href Value"`
}