TODO: 重构XPath 的使用 shit
This commit is contained in:
@@ -1,7 +1,14 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"intimate"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"os/signal"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/474420502/extractor"
|
||||
@@ -11,9 +18,18 @@ import (
|
||||
"log"
|
||||
"testing"
|
||||
|
||||
_ "net/http/pprof"
|
||||
|
||||
"github.com/474420502/requests"
|
||||
)
|
||||
|
||||
// Test checks how net/url percent-encodes a URL whose path contains
// non-ASCII characters, which is the form handed to the HTTP session when
// crawling twitcasting.tv pages.
//
// The parsed URL's escaped path and full string form are logged and compared
// against the expected UTF-8 percent-encoding, so the test only fails when
// the encoding actually differs (previously it reported the values through
// t.Error and therefore always failed).
func Test(t *testing.T) {
	const rawurl = "https://twitcasting.tv/你好"

	u, err := url.Parse(rawurl)
	if err != nil {
		// A nil *url.URL would panic below, so stop here.
		t.Fatalf("parse %q: %v", rawurl, err)
	}

	escapedPath := u.EscapedPath()
	t.Log(escapedPath)
	if want := "/%E4%BD%A0%E5%A5%BD"; escapedPath != want {
		t.Errorf("EscapedPath() = %q, want %q", escapedPath, want)
	}

	full := u.String()
	t.Log(full)
	if want := "https://twitcasting.tv/%E4%BD%A0%E5%A5%BD"; full != want {
		t.Errorf("String() = %q, want %q", full, want)
	}
}
|
||||
|
||||
// sstore is the source-store instance, i.e. the implementation that persists raw source data. For the table layout see sql/intimate_source.sql
|
||||
var sstore *intimate.StoreSource = intimate.NewStoreSource(string(intimate.STTwitcasting))
|
||||
|
||||
@@ -21,44 +37,74 @@ var sstore *intimate.StoreSource = intimate.NewStoreSource(string(intimate.STTwi
|
||||
var estore *intimate.StoreExtractor = intimate.NewStoreExtractor()
|
||||
|
||||
func TestMain(t *testing.T) {
|
||||
f, _ := os.OpenFile("./log", os.O_TRUNC|os.O_CREATE|os.O_RDWR, os.ModePerm)
|
||||
log.SetFlags(log.Llongfile | log.Ltime)
|
||||
log.SetOutput(f)
|
||||
|
||||
go func() {
|
||||
log.Println(http.ListenAndServe(":4040", nil))
|
||||
}()
|
||||
|
||||
homeurl := "https://twitcasting.tv"
|
||||
searchurl := "https://twitcasting.tv/rankingindex.php"
|
||||
queuedict := make(map[string]bool)
|
||||
queue := heap.New(compare.String)
|
||||
queue.Put(searchurl)
|
||||
queuedict[searchurl] = true
|
||||
ses := requests.NewSession()
|
||||
ses.Config().SetTimeout(15)
|
||||
|
||||
for surl, ok := queue.Pop(); ok; surl, ok = queue.Pop() {
|
||||
var surl interface{}
|
||||
var ok bool
|
||||
var debugsp *SearchProfile
|
||||
var content []byte
|
||||
|
||||
ses := requests.NewSession()
|
||||
resp, err := ses.Get(surl.(string)).Execute()
|
||||
defer func() {
|
||||
if ierr := recover(); ierr != nil {
|
||||
log.Println(surl, debugsp)
|
||||
f, _ := os.OpenFile("./error.html", os.O_TRUNC|os.O_CREATE|os.O_RDWR, os.ModePerm)
|
||||
f.Write(content)
|
||||
f.Close()
|
||||
log.Panic(ierr)
|
||||
}
|
||||
}()
|
||||
|
||||
go func() {
|
||||
signalchan := make(chan os.Signal)
|
||||
signal.Notify(signalchan, syscall.SIGINT, syscall.SIGKILL, syscall.SIGQUIT, syscall.SIGTERM, syscall.SIGSTOP)
|
||||
log.Println("accept stop command:", <-signalchan)
|
||||
f, _ := os.OpenFile("./error.html", os.O_TRUNC|os.O_CREATE|os.O_RDWR, os.ModePerm)
|
||||
f.Write(content)
|
||||
f.Close()
|
||||
os.Exit(1)
|
||||
}()
|
||||
|
||||
for surl, ok = queue.Pop(); ok; surl, ok = queue.Pop() {
|
||||
u, err := url.Parse(surl.(string))
|
||||
if err != nil {
|
||||
panic(err)
|
||||
log.Println(err)
|
||||
continue
|
||||
}
|
||||
|
||||
resp, err := ses.Get(u.String()).Execute()
|
||||
if err != nil {
|
||||
log.Println(err)
|
||||
log.Println(u.String(), surl)
|
||||
continue
|
||||
// log.Panic(err)
|
||||
}
|
||||
|
||||
content = resp.Content()
|
||||
etor := extractor.ExtractXml(resp.Content())
|
||||
|
||||
// doc, err := libxml2.ParseHTML(resp.Content())
|
||||
// if err != nil {
|
||||
// panic(err)
|
||||
// }
|
||||
// defer doc.Free()
|
||||
|
||||
result, err := etor.XPath("//*[contains(@class, 'tag')]/@href")
|
||||
result, err := etor.XPath("//p[@class='taglist']/a[contains(@class, 'tag')]/@href")
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
// result, err := doc.Find("//*[contains(@class, 'tag')]/@href")
|
||||
// if err != nil {
|
||||
// panic(err)
|
||||
// }
|
||||
// defer result.Free()
|
||||
|
||||
iter := result.NodeIter()
|
||||
for iter.Next() {
|
||||
|
||||
wurl := "https://twitcasting.tv" + iter.Node().NodeValue()
|
||||
wurl := homeurl + iter.Node().NodeValue()
|
||||
if ok := queuedict[wurl]; !ok {
|
||||
log.Println(wurl)
|
||||
sl := &intimate.StreamerList{}
|
||||
@@ -67,7 +113,9 @@ func TestMain(t *testing.T) {
|
||||
sl.Operator = 0
|
||||
sl.UpdateInterval = 120
|
||||
sl.UpdateTime = time.Now()
|
||||
|
||||
estore.InsertStreamerList(sl)
|
||||
|
||||
queue.Put(wurl)
|
||||
queuedict[wurl] = true
|
||||
}
|
||||
@@ -80,21 +128,53 @@ func TestMain(t *testing.T) {
|
||||
continue
|
||||
}
|
||||
|
||||
// xps.ForEachTag(SearchProfile{})
|
||||
|
||||
// texts, errs := xps.ForEachText(".//span[@class='username']")
|
||||
// if len(errs) > 0 {
|
||||
// t.Error(errs)
|
||||
// }
|
||||
log.Println("extract tag")
|
||||
var splist = xps.ForEachTag(SearchProfile{})
|
||||
log.Println("finish extract tag")
|
||||
for _, isp := range splist {
|
||||
sp := isp.(*SearchProfile)
|
||||
if sp.LiveUrl == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
sp.UserId = sp.LiveUrl[1:]
|
||||
for i := 0; i < len(sp.TagUrl); i++ {
|
||||
wurl := homeurl + sp.TagUrl[i]
|
||||
sp.TagUrl[i] = wurl
|
||||
if ok := queuedict[wurl]; !ok {
|
||||
sl := &intimate.StreamerList{}
|
||||
sl.Platform = intimate.Ptwitcasting
|
||||
sl.Url = wurl
|
||||
sl.Operator = 0
|
||||
sl.UpdateInterval = 120
|
||||
sl.UpdateTime = time.Now()
|
||||
estore.InsertStreamerList(sl)
|
||||
|
||||
queue.Put(wurl)
|
||||
queuedict[wurl] = true
|
||||
}
|
||||
}
|
||||
// log.Println(sp.(SearchProfile))
|
||||
}
|
||||
|
||||
log.Println("find user:", len(splist))
|
||||
for _, isp := range splist {
|
||||
log.Println(isp.(*SearchProfile))
|
||||
sp := isp.(*SearchProfile)
|
||||
// log.Println(sp)
|
||||
streamer := &intimate.Streamer{}
|
||||
streamer.Platform = intimate.Ptwitcasting
|
||||
streamer.LiveUrl = sql.NullString{String: sp.LiveUrl, Valid: true}
|
||||
if btags, err := json.Marshal(sp.Tag); err != nil {
|
||||
log.Println(err)
|
||||
} else {
|
||||
streamer.Tags = btags
|
||||
}
|
||||
streamer.UpdateInterval = 120
|
||||
streamer.UpdateTime = sql.NullTime{Time: time.Now(), Valid: true}
|
||||
streamer.UserName = sql.NullString{String: sp.UserName, Valid: true}
|
||||
streamer.UserId = sp.UserId
|
||||
debugsp = sp
|
||||
estore.InsertStreamer(streamer)
|
||||
}
|
||||
|
||||
log.Println("finish remain", queue.Size())
|
||||
@@ -102,7 +182,9 @@ func TestMain(t *testing.T) {
|
||||
}
|
||||
|
||||
// SearchProfile describes one streamer entry scraped from a twitcasting.tv
// listing page.
//
// Each tagged field carries an `exp` tag holding an XPath expression relative
// to the entry node and a `method` tag naming how the value is read from the
// matched node(s). The tags are presumably consumed by the extractor's
// ForEachTag call in TestMain — confirm against the extractor package.
type SearchProfile struct {
	// UserName is the text of the entry's "username" span.
	UserName string `exp:".//span[@class='username']" method:"Text"`
	// UserId has no active tag (the expression is commented out); TestMain
	// fills it from LiveUrl with the first character dropped.
	UserId string // `exp:".//span[@class='fullname']" method:"Text"`
	// LiveUrl is the href of the link inside the entry's "usertext" div.
	LiveUrl string `exp:".//div[@class='usertext']/a[@href]" method:"Attribute,href Value"`
	// Tag collects the text of every "tag tag-mini" link in the entry.
	Tag []string `exp:".//a[contains(@class, 'tag tag-mini')]" method:"Text"`
	// TagUrl collects the href of every "tag tag-mini" link in the entry.
	TagUrl []string `exp:".//a[contains(@class, 'tag tag-mini')]" method:"Attribute,href Value"`
}
|
||||
|
||||
Reference in New Issue
Block a user