TODO: 完善主播表的更新方式. 目前在更新最新状态时, 原有记录可能被改版等类似情况产生的数据覆盖, 从而找不到原信息数据.
This commit is contained in:
parent
0d8b456f41
commit
75b1d9fb77
|
@ -4,85 +4,85 @@ import (
|
||||||
"database/sql"
|
"database/sql"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"intimate"
|
"intimate"
|
||||||
|
"io/ioutil"
|
||||||
"log"
|
"log"
|
||||||
"os"
|
"os"
|
||||||
"regexp"
|
"regexp"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/474420502/hunter"
|
"github.com/474420502/hunter"
|
||||||
"github.com/474420502/requests"
|
"github.com/474420502/requests"
|
||||||
|
"github.com/lestrrat-go/libxml2"
|
||||||
"github.com/tidwall/gjson"
|
"github.com/tidwall/gjson"
|
||||||
)
|
)
|
||||||
|
|
||||||
func preNUm(data byte) int {
|
func TestCase1(t *testing.T) {
|
||||||
var mask byte = 0x80
|
date := "2020-07-13T18:58:24+09:00"
|
||||||
var num int = 0
|
|
||||||
//8bit中首个0bit前有多少个1bits
|
tm, err := time.Parse("2006-01-02T15:04:05Z07:00", date)
|
||||||
for i := 0; i < 8; i++ {
|
t.Error(err)
|
||||||
if (data & mask) == mask {
|
t.Error(time.Now())
|
||||||
num++
|
t.Error(tm.Local().UTC(), tm.Local())
|
||||||
mask = mask >> 1
|
|
||||||
} else {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return num
|
|
||||||
}
|
|
||||||
func isUtf8(data []byte) bool {
|
|
||||||
i := 0
|
|
||||||
for i < len(data) {
|
|
||||||
if (data[i] & 0x80) == 0x00 {
|
|
||||||
// 0XXX_XXXX
|
|
||||||
i++
|
|
||||||
continue
|
|
||||||
} else if num := preNUm(data[i]); num > 2 {
|
|
||||||
// 110X_XXXX 10XX_XXXX
|
|
||||||
// 1110_XXXX 10XX_XXXX 10XX_XXXX
|
|
||||||
// 1111_0XXX 10XX_XXXX 10XX_XXXX 10XX_XXXX
|
|
||||||
// 1111_10XX 10XX_XXXX 10XX_XXXX 10XX_XXXX 10XX_XXXX
|
|
||||||
// 1111_110X 10XX_XXXX 10XX_XXXX 10XX_XXXX 10XX_XXXX 10XX_XXXX
|
|
||||||
// preNUm() 返回首个字节的8个bits中首个0bit前面1bit的个数,该数量也是该字符所使用的字节数
|
|
||||||
i++
|
|
||||||
for j := 0; j < num-1; j++ {
|
|
||||||
//判断后面的 num - 1 个字节是不是都是10开头
|
|
||||||
if (data[i] & 0xc0) != 0x80 {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
i++
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
//其他情况说明不是utf-8
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func isGBK(data []byte) bool {
|
func TestCase2(t *testing.T) {
|
||||||
length := len(data)
|
duration1 := "0:00:00"
|
||||||
var i int = 0
|
duration2 := "4:56:04"
|
||||||
for i < length {
|
tm2, err := time.Parse("15:04:05", duration2)
|
||||||
if data[i] <= 0x7f {
|
tm1, err := time.Parse("15:04:05", duration1)
|
||||||
//编码0~127,只有一个字节的编码,兼容ASCII码
|
|
||||||
i++
|
tm2.Sub(tm1)
|
||||||
continue
|
|
||||||
} else {
|
t.Error(err)
|
||||||
//大于127的使用双字节编码,落在gbk编码范围内的字符
|
t.Error(tm2.Sub(tm1))
|
||||||
if data[i] >= 0x81 &&
|
|
||||||
data[i] <= 0xfe &&
|
|
||||||
data[i+1] >= 0x40 &&
|
|
||||||
data[i+1] <= 0xfe &&
|
|
||||||
data[i+1] != 0xf7 {
|
|
||||||
i += 2
|
|
||||||
continue
|
|
||||||
} else {
|
|
||||||
return false
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestCase(t *testing.T) {
|
||||||
|
f, _ := os.Open("./test.html")
|
||||||
|
data, _ := ioutil.ReadAll(f)
|
||||||
|
|
||||||
|
doc, err := libxml2.ParseHTML(data)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
}
|
}
|
||||||
|
// doc.CreateElement("meta")
|
||||||
|
// "<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=utf-8">"
|
||||||
|
|
||||||
|
xresult, err := doc.Find("/html/head")
|
||||||
|
ele, err := doc.CreateElement(`META`)
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
}
|
}
|
||||||
return true
|
ele.SetAttribute("charset", "utf-8")
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
iter := xresult.NodeIter()
|
||||||
|
if iter.Next() {
|
||||||
|
n := iter.Node()
|
||||||
|
|
||||||
|
err = n.AddChild(ele)
|
||||||
|
// childs, err := n.ChildNodes()
|
||||||
|
if err != nil {
|
||||||
|
t.Error(err)
|
||||||
|
}
|
||||||
|
t.Error(n)
|
||||||
|
}
|
||||||
|
|
||||||
|
xr, err := doc.Find("//h1[ contains(@class, 'MovieTitle__Title')]")
|
||||||
|
if err != nil {
|
||||||
|
panic(nil)
|
||||||
|
}
|
||||||
|
|
||||||
|
t.Error(xr)
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestExtractor(t *testing.T) {
|
func TestExtractor(t *testing.T) {
|
||||||
|
@ -96,6 +96,15 @@ func TestExtractor(t *testing.T) {
|
||||||
collect := intimate.NewExtractorStore()
|
collect := intimate.NewExtractorStore()
|
||||||
store := intimate.NewSourceStore("source_openrec")
|
store := intimate.NewSourceStore("source_openrec")
|
||||||
source, err := store.Pop(string(intimate.TTOpenrecRanking), 100)
|
source, err := store.Pop(string(intimate.TTOpenrecRanking), 100)
|
||||||
|
|
||||||
|
anchorId := source.GetSource().String
|
||||||
|
|
||||||
|
ai := &intimate.AnchorInfo{}
|
||||||
|
ai.SetAnchorId(anchorId)
|
||||||
|
ai.SetPlatform(string(intimate.Popenrec))
|
||||||
|
|
||||||
|
collect.InsertAnchorInfo(ai)
|
||||||
|
|
||||||
// if source != nil {
|
// if source != nil {
|
||||||
// defer store.Restore(source)
|
// defer store.Restore(source)
|
||||||
// }
|
// }
|
||||||
|
@ -110,7 +119,7 @@ func TestExtractor(t *testing.T) {
|
||||||
|
|
||||||
user := m["user"]
|
user := m["user"]
|
||||||
|
|
||||||
ai := &intimate.CollectLog{}
|
clog := &intimate.CollectLog{}
|
||||||
extractor := hunter.NewExtractor([]byte(user.Str))
|
extractor := hunter.NewExtractor([]byte(user.Str))
|
||||||
xp, err := extractor.XPathResult("//p[@class='c-global__user__count__row__right js-userCountFollowers']/text()")
|
xp, err := extractor.XPathResult("//p[@class='c-global__user__count__row__right js-userCountFollowers']/text()")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -121,8 +130,8 @@ func TestExtractor(t *testing.T) {
|
||||||
}
|
}
|
||||||
|
|
||||||
followers := strings.ReplaceAll(xp.String(), ",", "")
|
followers := strings.ReplaceAll(xp.String(), ",", "")
|
||||||
|
|
||||||
followersInt, err := strconv.ParseInt(followers, 10, 64)
|
followersInt, err := strconv.ParseInt(followers, 10, 64)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Error(err)
|
t.Error(err)
|
||||||
}
|
}
|
||||||
|
@ -134,6 +143,7 @@ func TestExtractor(t *testing.T) {
|
||||||
} else {
|
} else {
|
||||||
t.Error(err)
|
t.Error(err)
|
||||||
}
|
}
|
||||||
|
t.Error(source.GetSource())
|
||||||
t.Error(anchorName)
|
t.Error(anchorName)
|
||||||
|
|
||||||
// c-contents
|
// c-contents
|
||||||
|
@ -149,8 +159,8 @@ func TestExtractor(t *testing.T) {
|
||||||
t.Error(err)
|
t.Error(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
ai.SetViews(sql.NullInt64{Int64: int64(viewsint), Valid: true})
|
clog.SetViews(sql.NullInt64{Int64: int64(viewsint), Valid: true})
|
||||||
ai.SetIsShowing(1)
|
clog.SetIsShowing(1)
|
||||||
}
|
}
|
||||||
|
|
||||||
var givers []interface{}
|
var givers []interface{}
|
||||||
|
@ -167,50 +177,69 @@ func TestExtractor(t *testing.T) {
|
||||||
giversbytes, err := json.Marshal(givers)
|
giversbytes, err := json.Marshal(givers)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Error(err)
|
t.Error(err)
|
||||||
ai.SetErrorMsg(sql.NullString{String: err.Error(), Valid: true})
|
clog.SetErrorMsg(sql.NullString{String: err.Error(), Valid: true})
|
||||||
} else {
|
} else {
|
||||||
ai.SetGiver(giversbytes)
|
clog.SetGiver(giversbytes)
|
||||||
}
|
}
|
||||||
|
|
||||||
// MovieToolbar__Views-g5e6ic-13 iDRGyA
|
// MovieToolbar__Views-g5e6ic-13 iDRGyA
|
||||||
livejson := m["user_live"]
|
livejson := m["user_live"]
|
||||||
|
|
||||||
f, err := os.OpenFile("./test.html", os.O_CREATE|os.O_TRUNC|os.O_RDWR, os.ModePerm)
|
// f, err := os.OpenFile("./test.html", os.O_CREATE|os.O_TRUNC|os.O_RDWR, os.ModePerm)
|
||||||
if err != nil {
|
// if err != nil {
|
||||||
panic(err)
|
// panic(err)
|
||||||
}
|
// }
|
||||||
f.WriteString(livejson.String())
|
// f.WriteString(livejson.String())
|
||||||
t.Error(livejson)
|
|
||||||
extractor = hunter.NewExtractor([]byte(livejson.Str))
|
|
||||||
xr, err := extractor.XPathResult("//h1[ contains(@class, 'MovieTitle__Title')]")
|
|
||||||
|
|
||||||
if err != nil {
|
extractor = hunter.NewExtractor([]byte(livejson.Str))
|
||||||
t.Error(err)
|
// xr, err := extractor.XPathResult("//h1[ contains(@class, 'MovieTitle__Title')]")
|
||||||
}
|
// if err != nil {
|
||||||
t.Error(xr)
|
// t.Error(err)
|
||||||
iter := xr.NodeIter()
|
// }
|
||||||
if iter.Next() {
|
|
||||||
t.Error(iter.Node().TextContent())
|
mathes := regexp.MustCompile("MovieTitle__Title.*>(.+)</h1>").FindStringSubmatch(livejson.Str)
|
||||||
ai.SetShowTitle(sql.NullString{String: iter.Node().TextContent(), Valid: true})
|
if len(mathes) == 2 {
|
||||||
|
|
||||||
|
clog.SetShowTitle(sql.NullString{String: mathes[1], Valid: true})
|
||||||
|
|
||||||
content, err := extractor.XPathResult("//meta[@itemprop='uploadDate']/@content")
|
content, err := extractor.XPathResult("//meta[@itemprop='uploadDate']/@content")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Error(err)
|
t.Error(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
if content.NodeIter().Next() {
|
iter := content.NodeIter()
|
||||||
t.Error(content.String())
|
if iter.Next() {
|
||||||
|
tm, err := time.ParseInLocation("2006-01-02T15:04:05Z07:00", iter.Node().NodeValue(), time.Local)
|
||||||
|
if err != nil {
|
||||||
|
t.Error(err)
|
||||||
|
}
|
||||||
|
clog.SetShowStartTime(sql.NullTime{Time: tm.Local(), Valid: true})
|
||||||
|
|
||||||
|
duration, err := extractor.XPathResult("//meta[@itemprop='duration']/@content")
|
||||||
|
if err != nil {
|
||||||
|
t.Error(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
diter := duration.NodeIter()
|
||||||
|
if diter.Next() {
|
||||||
|
|
||||||
|
dt, err := intimate.ParseDuration(diter.Node().NodeValue())
|
||||||
|
if err != nil {
|
||||||
|
log.Println(err)
|
||||||
|
}
|
||||||
|
endtm := tm.Add(dt)
|
||||||
|
clog.SetShowEndTime(sql.NullTime{Time: endtm.Local(), Valid: true})
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
t.Error(xr.String(), xr.NodeIter().Next(), xr.String())
|
|
||||||
|
|
||||||
ai.SetGratuity(sql.NullInt64{Int64: gratuity, Valid: true})
|
clog.SetGratuity(sql.NullInt64{Int64: gratuity, Valid: true})
|
||||||
ai.SetPlatform(string(intimate.Popenrec))
|
clog.SetPlatform(string(intimate.Popenrec))
|
||||||
ai.SetFollowers(sql.NullInt64{Int64: int64(followersInt), Valid: true})
|
clog.SetFollowers(sql.NullInt64{Int64: int64(followersInt), Valid: true})
|
||||||
ai.SetAnchorId(source.GetSource().String)
|
clog.SetAnchorId(anchorId)
|
||||||
ai.SetUpdateTime(source.GetUpdateTime())
|
clog.SetUpdateTime(source.GetUpdateTime())
|
||||||
|
|
||||||
collect.InsertCollectLog(ai)
|
collect.InsertCollectLog(clog)
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
t.Error("data is not json:\n", string(sdata))
|
t.Error("data is not json:\n", string(sdata))
|
||||||
|
|
1
go.mod
1
go.mod
|
@ -7,6 +7,7 @@ require (
|
||||||
github.com/474420502/hunter v0.3.0
|
github.com/474420502/hunter v0.3.0
|
||||||
github.com/474420502/requests v1.6.0
|
github.com/474420502/requests v1.6.0
|
||||||
github.com/go-sql-driver/mysql v1.5.0
|
github.com/go-sql-driver/mysql v1.5.0
|
||||||
|
github.com/lestrrat-go/libxml2 v0.0.0-20200215080510-6483566f52cb
|
||||||
github.com/tidwall/gjson v1.6.0
|
github.com/tidwall/gjson v1.6.0
|
||||||
github.com/tidwall/pretty v1.0.1 // indirect
|
github.com/tidwall/pretty v1.0.1 // indirect
|
||||||
golang.org/x/net v0.0.0-20200707034311-ab3426394381 // indirect
|
golang.org/x/net v0.0.0-20200707034311-ab3426394381 // indirect
|
||||||
|
|
6
store.go
6
store.go
|
@ -181,9 +181,10 @@ func NewExtractorStore() *ExtractorStore {
|
||||||
*/
|
*/
|
||||||
|
|
||||||
// InsertAnchorInfo AnchorInfo表, 插入数据
|
// InsertAnchorInfo AnchorInfo表, 插入数据
|
||||||
func (store *ExtractorStore) InsertAnchorInfo(isource IGetAnchorInfo) {
|
func (store *ExtractorStore) InsertAnchorInfo(isource IGetAnchorInfo) error {
|
||||||
_, err := store.db.Exec("insert into "+AnchorTable+"(platform, anchor_id, anchor_name, live_url, channel, show_type, ext) values(?,?,?,?,?,?,?) ON DUPLICATE KEY UPDATE", isource.GetPlatform(), isource.GetAnchorId(), isource.GetAnchorName(), isource.GetLiveUrl(), isource.GetChannel(), isource.GetShowType(), isource.GetExt())
|
_, err := store.db.Exec("insert into "+AnchorTable+"(platform, anchor_id, anchor_name, live_url, channel, show_type, ext) values(?,?,?,?,?,?,?) ON DUPLICATE KEY UPDATE", isource.GetPlatform(), isource.GetAnchorId(), isource.GetAnchorName(), isource.GetLiveUrl(), isource.GetChannel(), isource.GetShowType(), isource.GetExt())
|
||||||
store.errorAlarm(err)
|
store.errorAlarm(err)
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -209,9 +210,10 @@ func (store *ExtractorStore) InsertAnchorInfo(isource IGetAnchorInfo) {
|
||||||
*/
|
*/
|
||||||
|
|
||||||
// InsertCollectLog CollectLog表插入数据
|
// InsertCollectLog CollectLog表插入数据
|
||||||
func (store *ExtractorStore) InsertCollectLog(isource IGetCollectLog) {
|
func (store *ExtractorStore) InsertCollectLog(isource IGetCollectLog) error {
|
||||||
_, err := store.db.Exec("insert into "+CollectLogTable+"(uid, platform, anchor_id, is_showing, is_error, followers, views, giver, gratuity, show_title, show_start_time, show_end_time, update_time, ext, error_msg) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
|
_, err := store.db.Exec("insert into "+CollectLogTable+"(uid, platform, anchor_id, is_showing, is_error, followers, views, giver, gratuity, show_title, show_start_time, show_end_time, update_time, ext, error_msg) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
|
||||||
isource.GetUid(), isource.GetPlatform(), isource.GetAnchorId(), isource.GetIsShowing(), isource.GetIsError(), isource.GetFollowers(), isource.GetViews(), isource.GetGiver(), isource.GetGratuity(), isource.GetShowTitle(), isource.GetShowStartTime(), isource.GetShowEndTime(), isource.GetUpdateTime(), isource.GetExt(), isource.GetErrorMsg(),
|
isource.GetUid(), isource.GetPlatform(), isource.GetAnchorId(), isource.GetIsShowing(), isource.GetIsError(), isource.GetFollowers(), isource.GetViews(), isource.GetGiver(), isource.GetGratuity(), isource.GetShowTitle(), isource.GetShowStartTime(), isource.GetShowEndTime(), isource.GetUpdateTime(), isource.GetExt(), isource.GetErrorMsg(),
|
||||||
)
|
)
|
||||||
store.errorAlarm(err)
|
store.errorAlarm(err)
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
|
|
|
@ -35,7 +35,7 @@ func (oer *OpenrecExtratorRanking) Execute(cxt *hunter.TaskContext) {
|
||||||
|
|
||||||
source, err := store.Pop(string(intimate.TTOpenrecUser))
|
source, err := store.Pop(string(intimate.TTOpenrecUser))
|
||||||
|
|
||||||
if source == nil && err != nil {
|
if source == nil || err != nil {
|
||||||
log.Println(err)
|
log.Println(err)
|
||||||
time.Sleep(time.Second * 2)
|
time.Sleep(time.Second * 2)
|
||||||
continue
|
continue
|
||||||
|
|
28
utils.go
Normal file
28
utils.go
Normal file
|
@ -0,0 +1,28 @@
|
||||||
|
package intimate
|
||||||
|
|
||||||
|
import (
|
||||||
|
"log"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
var zeroTime time.Time
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
|
||||||
|
tm, err := time.Parse("15:04:05", "0:00:00")
|
||||||
|
if err != nil {
|
||||||
|
log.Println(err)
|
||||||
|
}
|
||||||
|
zeroTime = tm
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// ParseDuration converts a clock-style string "H:MM:SS" (for example
// "1:40:00") into a time.Duration.
//
// The hour field is one or more decimal digits and is not capped at 23, so
// spans of a day or longer such as "25:00:00" are accepted. The minute and
// second fields are handed to time.Parse with the "04:05" layout, so each must
// be exactly two digits in the range 00-59.
//
// On malformed input it returns a zero duration and a non-nil error: a
// *time.ParseError for a bad hour field, otherwise the error reported by
// time.Parse for the minute/second part.
func ParseDuration(dt string) (time.Duration, error) {
	const layout = "15:04:05"
	// Largest hour count that still leaves room for the minute/second part
	// without overflowing the int64 nanosecond counter behind time.Duration.
	const maxHours = (1<<63-1)/int64(time.Hour) - 1

	// Locate the end of the hour field.
	sep := -1
	for i := 0; i < len(dt); i++ {
		if dt[i] == ':' {
			sep = i
			break
		}
	}
	if sep <= 0 {
		// No separator at all, or an empty hour field.
		return 0, &time.ParseError{Layout: layout, Value: dt, LayoutElem: "15", ValueElem: dt}
	}

	// Parse the hour digits by hand: time.Parse would reject anything above
	// 23, which is a valid value for a duration.
	var hours int64
	for i := 0; i < sep; i++ {
		c := dt[i]
		if c < '0' || c > '9' {
			return 0, &time.ParseError{Layout: layout, Value: dt, LayoutElem: "15", ValueElem: dt[:sep]}
		}
		hours = hours*10 + int64(c-'0')
		if hours > maxHours {
			return 0, &time.ParseError{Layout: layout, Value: dt, LayoutElem: "15", ValueElem: dt[:sep], Message: ": hour out of range"}
		}
	}

	// Minutes and seconds keep the strict validation time.Parse provides.
	rest, err := time.Parse("04:05", dt[sep+1:])
	if err != nil {
		return 0, err
	}

	total := time.Duration(hours)*time.Hour +
		time.Duration(rest.Minute())*time.Minute +
		time.Duration(rest.Second())*time.Second +
		time.Duration(rest.Nanosecond())
	return total, nil
}
|
Loading…
Reference in New Issue
Block a user