Extractor upgrade

This commit is contained in:
eson
2020-08-05 18:49:47 +08:00
parent 6158976986
commit 0bff7169ec
10 changed files with 180 additions and 15 deletions

View File

@@ -1,6 +1,10 @@
package main
import (
"intimate"
"time"
"github.com/474420502/extractor"
"github.com/474420502/focus/compare"
"github.com/474420502/focus/tree/heap"
@@ -8,9 +12,14 @@ import (
"testing"
"github.com/474420502/requests"
"github.com/lestrrat-go/libxml2"
)
// sstore is the source-store instance, the implementation used for storing source data.
// For the table layout, see sql/intimate_source.sql.
var sstore *intimate.StoreSource = intimate.NewStoreSource(string(intimate.STTwitcasting))
// estore is the extractor-store connection instance.
var estore *intimate.StoreExtractor = intimate.NewStoreExtractor()
func TestMain(t *testing.T) {
searchurl := "https://twitcasting.tv/rankingindex.php"
@@ -27,30 +36,73 @@ func TestMain(t *testing.T) {
panic(err)
}
doc, err := libxml2.ParseHTML(resp.Content())
etor := extractor.ExtractXml(resp.Content())
// doc, err := libxml2.ParseHTML(resp.Content())
// if err != nil {
// panic(err)
// }
// defer doc.Free()
result, err := etor.XPath("//*[contains(@class, 'tag')]/@href")
if err != nil {
panic(err)
}
defer doc.Free()
result, err := doc.Find("//*[contains(@class, 'tag')]/@href")
if err != nil {
panic(err)
}
defer result.Free()
// result, err := doc.Find("//*[contains(@class, 'tag')]/@href")
// if err != nil {
// panic(err)
// }
// defer result.Free()
iter := result.NodeIter()
for iter.Next() {
log.Println(iter.Node().NodeValue())
wurl := "https://twitcasting.tv" + iter.Node().NodeValue()
if ok := queuedict[wurl]; !ok {
log.Println(wurl)
sl := &intimate.StreamerList{}
sl.Platform = intimate.Ptwitcasting
sl.Url = wurl
sl.Operator = 0
sl.UpdateInterval = 120
sl.UpdateTime = time.Now()
estore.InsertStreamerList(sl)
queue.Put(wurl)
queuedict[wurl] = true
}
}
doc.Find("//div[@class='tw-search-result-row']")
// doc.Find("//div[@class='tw-search-result-row']")
xps, err := etor.XPaths("//div[@class='tw-search-result-row']")
if err != nil {
log.Println(surl, err)
continue
}
// xps.ForEachTag(SearchProfile{})
// texts, errs := xps.ForEachText(".//span[@class='username']")
// if len(errs) > 0 {
// t.Error(errs)
// }
var splist = xps.ForEachTag(SearchProfile{})
for _, isp := range splist {
sp := isp.(*SearchProfile)
sp.UserId = sp.LiveUrl[1:]
// log.Println(sp.(SearchProfile))
}
for _, isp := range splist {
log.Println(isp.(*SearchProfile))
}
log.Println("finish remain", queue.Size())
}
}
// SearchProfile describes one user entry taken from a search-result row.
//
// A zero value of this type is handed to the extractor's ForEachTag, which
// yields *SearchProfile values. The `exp` and `method` struct tags are
// presumably the XPath expression and the accessor chain that ForEachTag
// applies per field — confirm against the extractor package documentation.
type SearchProfile struct {
	// UserName is tagged with the XPath of the row's "username" span.
	UserName string `exp:".//span[@class='username']" method:"Text"`

	// UserId has no active extraction tag; the caller derives it from LiveUrl
	// by dropping the first character.
	// Disabled tag, kept for reference: `exp:".//span[@class='fullname']" method:"Text"`
	UserId string

	// LiveUrl is tagged with the XPath of the anchor inside the row's
	// "usertext" div; the method chain names the href attribute.
	LiveUrl string `exp:".//div[@class='usertext']/a[@href]" method:"Attribute,href Value"`
}