完成解析模块示例. 数据库入库测试.
TODO: 调整程序启动停止(非暴力关闭).
This commit is contained in:
215
extractor/openrec/openrec_extractor.go
Normal file
215
extractor/openrec/openrec_extractor.go
Normal file
@@ -0,0 +1,215 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"intimate"
|
||||
"log"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/tidwall/gjson"
|
||||
)
|
||||
|
||||
// OpenrecExtractor 提取方法
|
||||
type OpenrecExtractor struct {
|
||||
user *intimate.ExtractorSource
|
||||
userLive *intimate.ExtractorSource
|
||||
supporters *intimate.ExtractorSource
|
||||
}
|
||||
|
||||
func (oe *OpenrecExtractor) extractFollowers(clog intimate.ISet) {
|
||||
extractor := oe.user.GetExtractor()
|
||||
xp, err := extractor.XPathResult("//p[@class='c-global__user__count__row__right js-userCountFollowers']/text()")
|
||||
if err != nil {
|
||||
log.Println(err)
|
||||
}
|
||||
if !xp.NodeIter().Next() {
|
||||
log.Println("不存在粉丝数")
|
||||
}
|
||||
|
||||
followers := strings.ReplaceAll(xp.String(), ",", "")
|
||||
followersInt, err := strconv.ParseInt(followers, 10, 64)
|
||||
if err != nil {
|
||||
log.Println(err)
|
||||
}
|
||||
|
||||
clog.Set("Followers", sql.NullInt64{Int64: followersInt, Valid: true})
|
||||
}
|
||||
|
||||
func (oe *OpenrecExtractor) extractAnchorName(ai intimate.ISet) {
|
||||
extractor := oe.user.GetExtractor()
|
||||
xp, err := extractor.XPathResult("//p[@class='c-global__user__profile__list__name__text official-icon--after']/text()")
|
||||
if xp.NodeIter().Next() {
|
||||
anchorName := xp.String()
|
||||
ai.Set("AnchorName", anchorName)
|
||||
} else {
|
||||
log.Println(err)
|
||||
}
|
||||
}
|
||||
|
||||
func (oe *OpenrecExtractor) extractViewsAndLiveStreaming(clog intimate.ISet) {
|
||||
extractor := oe.user.GetExtractor()
|
||||
// c-contents
|
||||
xp, err := extractor.XPathResult("//ul[@class='c-contents']//p[@class='c-thumbnailVideo__footer__liveCount']/text()")
|
||||
if err != nil {
|
||||
log.Println(err)
|
||||
}
|
||||
if xp.NodeIter().Next() {
|
||||
views := regexp.MustCompile(`[0-9,]+`).FindString(xp.String())
|
||||
views = strings.ReplaceAll(views, ",", "")
|
||||
viewsint, err := strconv.Atoi(views)
|
||||
if err != nil {
|
||||
log.Println(err)
|
||||
}
|
||||
|
||||
clog.Set("Views", sql.NullInt64{Int64: int64(viewsint), Valid: true})
|
||||
clog.Set("IsLiveStreaming", int32(1))
|
||||
}
|
||||
}
|
||||
|
||||
func (oe *OpenrecExtractor) extractGiversAndGratuity(clog intimate.ISet) {
|
||||
// extractor := oe.user.GetExtractor()
|
||||
giverjson := oe.supporters.GetSource()
|
||||
var givers []interface{}
|
||||
var gratuity int64 = 0
|
||||
|
||||
for _, v := range giverjson.Array() {
|
||||
giverSource := gjson.Parse(v.String())
|
||||
for _, item := range giverSource.Get("data.items").Array() {
|
||||
givers = append(givers, item.Map())
|
||||
gratuity += item.Get("total_yells").Int()
|
||||
}
|
||||
}
|
||||
|
||||
giversbytes, err := json.Marshal(givers)
|
||||
if err != nil {
|
||||
log.Println(err)
|
||||
clog.Set("ErrorMsg", sql.NullString{String: err.Error(), Valid: true})
|
||||
} else {
|
||||
clog.Set("Giver", giversbytes)
|
||||
}
|
||||
|
||||
clog.Set("Gratuity", sql.NullInt64{Int64: gratuity, Valid: true})
|
||||
}
|
||||
|
||||
func (oe *OpenrecExtractor) extractLive(clog intimate.ISet) {
|
||||
extractor := oe.userLive.GetExtractor()
|
||||
mathes := regexp.MustCompile("MovieTitle__Title[^>]+>(.{1,50})</h1>").FindStringSubmatch(oe.userLive.GetSource().Str)
|
||||
if len(mathes) == 2 {
|
||||
|
||||
clog.Set("LiveTitle", sql.NullString{String: mathes[1], Valid: true})
|
||||
|
||||
content, err := extractor.XPathResult("//meta[@itemprop='uploadDate']/@content")
|
||||
if err != nil {
|
||||
log.Println(err)
|
||||
}
|
||||
|
||||
iter := content.NodeIter()
|
||||
if iter.Next() {
|
||||
tm, err := time.ParseInLocation("2006-01-02T15:04:05Z07:00", iter.Node().NodeValue(), time.Local)
|
||||
if err != nil {
|
||||
log.Println(err)
|
||||
}
|
||||
clog.Set("LiveStartTime", sql.NullTime{Time: tm.Local(), Valid: true})
|
||||
|
||||
duration, err := extractor.XPathResult("//meta[@itemprop='duration']/@content")
|
||||
if err != nil {
|
||||
log.Println(err)
|
||||
}
|
||||
|
||||
diter := duration.NodeIter()
|
||||
if diter.Next() {
|
||||
|
||||
dt, err := intimate.ParseDuration(diter.Node().NodeValue())
|
||||
if err != nil {
|
||||
log.Println(err)
|
||||
}
|
||||
endtm := tm.Add(dt)
|
||||
clog.Set("LiveEndTime", sql.NullTime{Time: endtm.Local(), Valid: true})
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (oe *OpenrecExtractor) extractTags(clog intimate.ISet) {
|
||||
var tags []string
|
||||
matheslist := regexp.MustCompile(`TagButton__Button[^>]+>(.{1,100})</a`).FindAllStringSubmatch(oe.userLive.GetSource().Str, -1)
|
||||
for _, m := range matheslist {
|
||||
tags = append(tags, m[1])
|
||||
}
|
||||
log.Println(tags)
|
||||
tagsBytes, err := json.Marshal(tags)
|
||||
if err != nil {
|
||||
log.Println(err)
|
||||
}
|
||||
|
||||
clog.Set("Tags", tagsBytes)
|
||||
}
|
||||
|
||||
func (oe *OpenrecExtractor) Execute() {
|
||||
collect := intimate.NewExtractorStore()
|
||||
store := intimate.NewSourceStore("source_openrec")
|
||||
|
||||
for {
|
||||
source, err := store.Pop(string(intimate.TTOpenrecRanking), 100)
|
||||
if err != nil {
|
||||
log.Println(err)
|
||||
return
|
||||
}
|
||||
|
||||
anchorId := source.GetSource().String
|
||||
|
||||
ai := &intimate.AnchorInfo{}
|
||||
ai.SetAnchorId(anchorId)
|
||||
ai.SetPlatform(string(intimate.Popenrec))
|
||||
|
||||
sdata := source.GetExt().([]byte)
|
||||
|
||||
if gjson.ValidBytes(sdata) {
|
||||
result := gjson.ParseBytes(sdata)
|
||||
datamap := result.Map()
|
||||
|
||||
oe := &OpenrecExtractor{}
|
||||
oe.user = intimate.NewExtractorSource(datamap["user"])
|
||||
oe.user.CreateExtractor()
|
||||
|
||||
oe.userLive = intimate.NewExtractorSource(datamap["user_live"])
|
||||
oe.userLive.CreateExtractor()
|
||||
|
||||
oe.supporters = intimate.NewExtractorSource(datamap["supporters"])
|
||||
|
||||
clog := &intimate.CollectLog{}
|
||||
|
||||
oe.extractFollowers(clog)
|
||||
oe.extractAnchorName(ai)
|
||||
oe.extractViewsAndLiveStreaming(clog)
|
||||
oe.extractGiversAndGratuity(clog)
|
||||
oe.extractLive(clog)
|
||||
oe.extractTags(clog)
|
||||
|
||||
ai.Set("UpdateTime", source.GetUpdateTime())
|
||||
|
||||
LiveUrl := "https://www.openrec.tv/live/" + anchorId
|
||||
ai.Set("LiveUrl", sql.NullString{String: LiveUrl, Valid: true})
|
||||
|
||||
Uid, err := collect.InsertAnchorInfo(ai)
|
||||
if err != nil {
|
||||
log.Println(err)
|
||||
return
|
||||
}
|
||||
|
||||
clog.Set("Uid", Uid)
|
||||
clog.Set("Platform", string(intimate.Popenrec))
|
||||
clog.Set("AnchorId", anchorId)
|
||||
clog.Set("UpdateTime", source.GetUpdateTime())
|
||||
|
||||
collect.InsertCollectLog(clog)
|
||||
} else {
|
||||
log.Println("data is not json:\n", string(sdata))
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -2,19 +2,14 @@ package main
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"intimate"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"os"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/474420502/hunter"
|
||||
"github.com/474420502/requests"
|
||||
"github.com/lestrrat-go/libxml2"
|
||||
"github.com/tidwall/gjson"
|
||||
)
|
||||
@@ -100,187 +95,66 @@ func TestCase(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestExtractor(t *testing.T) {
|
||||
|
||||
ses := requests.NewSession()
|
||||
tp := ses.Get("https://www.openrec.tv/user/Riowh/supporters")
|
||||
tp.Execute()
|
||||
|
||||
// t.Error(ses.GetCookies(wf.GetParsedURL()))
|
||||
|
||||
collect := intimate.NewExtractorStore()
|
||||
store := intimate.NewSourceStore("source_openrec")
|
||||
source, err := store.Pop(string(intimate.TTOpenrecRanking), 100)
|
||||
if err != nil {
|
||||
log.Println(err)
|
||||
return
|
||||
}
|
||||
|
||||
anchorId := source.GetSource().String
|
||||
|
||||
ai := &intimate.AnchorInfo{}
|
||||
ai.SetAnchorId(anchorId)
|
||||
ai.SetPlatform(string(intimate.Popenrec))
|
||||
|
||||
sdata := source.GetExt().([]byte)
|
||||
|
||||
if gjson.ValidBytes(sdata) {
|
||||
result := gjson.ParseBytes(sdata)
|
||||
m := result.Map()
|
||||
|
||||
user := m["user"]
|
||||
|
||||
clog := &intimate.CollectLog{}
|
||||
extractor := hunter.NewExtractor([]byte(user.Str))
|
||||
xp, err := extractor.XPathResult("//p[@class='c-global__user__count__row__right js-userCountFollowers']/text()")
|
||||
if err != nil {
|
||||
t.Error(err)
|
||||
}
|
||||
if !xp.NodeIter().Next() {
|
||||
t.Error("不存在粉丝数")
|
||||
}
|
||||
|
||||
followers := strings.ReplaceAll(xp.String(), ",", "")
|
||||
followersInt, err := strconv.ParseInt(followers, 10, 64)
|
||||
|
||||
if err != nil {
|
||||
t.Error(err)
|
||||
}
|
||||
|
||||
var anchorName string
|
||||
xp, err = extractor.XPathResult("//p[@class='c-global__user__profile__list__name__text official-icon--after']/text()")
|
||||
if xp.NodeIter().Next() {
|
||||
anchorName = xp.String()
|
||||
} else {
|
||||
t.Error(err)
|
||||
}
|
||||
|
||||
t.Error(source.GetSource())
|
||||
t.Error(anchorName)
|
||||
|
||||
ai.SetAnchorName(anchorName)
|
||||
|
||||
// c-contents
|
||||
xp, err = extractor.XPathResult("//ul[@class='c-contents']//p[@class='c-thumbnailVideo__footer__liveCount']/text()")
|
||||
for {
|
||||
source, err := store.Pop(string(intimate.TTOpenrecRanking), 100)
|
||||
if err != nil {
|
||||
log.Println(err)
|
||||
}
|
||||
if xp.NodeIter().Next() {
|
||||
views := regexp.MustCompile(`[0-9,]+`).FindString(xp.String())
|
||||
views = strings.ReplaceAll(views, ",", "")
|
||||
viewsint, err := strconv.Atoi(views)
|
||||
if err != nil {
|
||||
t.Error(err)
|
||||
}
|
||||
|
||||
clog.SetViews(sql.NullInt64{Int64: int64(viewsint), Valid: true})
|
||||
clog.SetIsShowing(1)
|
||||
}
|
||||
|
||||
var givers []interface{}
|
||||
var gratuity int64 = 0
|
||||
giverjson := m["supporters"]
|
||||
for _, v := range giverjson.Array() {
|
||||
giverSource := gjson.Parse(v.String())
|
||||
for _, item := range giverSource.Get("data.items").Array() {
|
||||
givers = append(givers, item.Map())
|
||||
gratuity += item.Get("total_yells").Int()
|
||||
}
|
||||
}
|
||||
|
||||
giversbytes, err := json.Marshal(givers)
|
||||
if err != nil {
|
||||
t.Error(err)
|
||||
clog.SetErrorMsg(sql.NullString{String: err.Error(), Valid: true})
|
||||
} else {
|
||||
clog.SetGiver(giversbytes)
|
||||
}
|
||||
|
||||
// MovieToolbar__Views-g5e6ic-13 iDRGyA
|
||||
livejson := m["user_live"]
|
||||
|
||||
f, err := os.OpenFile("./test.html", os.O_CREATE|os.O_TRUNC|os.O_RDWR, os.ModePerm)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
f.WriteString(livejson.String())
|
||||
|
||||
extractor = hunter.NewExtractor([]byte(livejson.Str))
|
||||
// xr, err := extractor.XPathResult("//h1[ contains(@class, 'MovieTitle__Title')]")
|
||||
// if err != nil {
|
||||
// t.Error(err)
|
||||
// }
|
||||
|
||||
mathes := regexp.MustCompile("MovieTitle__Title[^>]+>(.{1,50})</h1>").FindStringSubmatch(livejson.Str)
|
||||
if len(mathes) == 2 {
|
||||
|
||||
clog.SetShowTitle(sql.NullString{String: mathes[1], Valid: true})
|
||||
|
||||
content, err := extractor.XPathResult("//meta[@itemprop='uploadDate']/@content")
|
||||
if err != nil {
|
||||
t.Error(err)
|
||||
}
|
||||
|
||||
iter := content.NodeIter()
|
||||
if iter.Next() {
|
||||
tm, err := time.ParseInLocation("2006-01-02T15:04:05Z07:00", iter.Node().NodeValue(), time.Local)
|
||||
if err != nil {
|
||||
t.Error(err)
|
||||
}
|
||||
clog.SetShowStartTime(sql.NullTime{Time: tm.Local(), Valid: true})
|
||||
|
||||
duration, err := extractor.XPathResult("//meta[@itemprop='duration']/@content")
|
||||
if err != nil {
|
||||
t.Error(err)
|
||||
}
|
||||
|
||||
diter := duration.NodeIter()
|
||||
if diter.Next() {
|
||||
|
||||
dt, err := intimate.ParseDuration(diter.Node().NodeValue())
|
||||
if err != nil {
|
||||
log.Println(err)
|
||||
}
|
||||
endtm := tm.Add(dt)
|
||||
clog.SetShowEndTime(sql.NullTime{Time: endtm.Local(), Valid: true})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var tags []string
|
||||
matheslist := regexp.MustCompile(`TagButton__Button[^>]+>(.{1,100})</a`).FindAllStringSubmatch(livejson.Str, -1)
|
||||
for _, m := range matheslist {
|
||||
tags = append(tags, m[1])
|
||||
}
|
||||
t.Error(tags)
|
||||
tagsBytes, err := json.Marshal(tags)
|
||||
if err != nil {
|
||||
log.Println(err)
|
||||
}
|
||||
|
||||
ai.SetTags(tagsBytes)
|
||||
ai.SetUpdateTime(source.GetUpdateTime())
|
||||
|
||||
LiveUrl := "https://www.openrec.tv/live/" + anchorId
|
||||
ai.SetLiveUrl(sql.NullString{String: LiveUrl, Valid: true})
|
||||
|
||||
Uid, err := collect.InsertAnchorInfo(ai)
|
||||
if err != nil {
|
||||
t.Error(err)
|
||||
return
|
||||
}
|
||||
|
||||
clog.SetUid(Uid)
|
||||
clog.SetTags(tagsBytes)
|
||||
clog.SetGratuity(sql.NullInt64{Int64: gratuity, Valid: true})
|
||||
clog.SetPlatform(string(intimate.Popenrec))
|
||||
clog.SetFollowers(sql.NullInt64{Int64: int64(followersInt), Valid: true})
|
||||
clog.SetAnchorId(anchorId)
|
||||
clog.SetUpdateTime(source.GetUpdateTime())
|
||||
anchorId := source.GetSource().String
|
||||
|
||||
collect.InsertCollectLog(clog)
|
||||
ai := &intimate.AnchorInfo{}
|
||||
ai.SetAnchorId(anchorId)
|
||||
ai.SetPlatform(string(intimate.Popenrec))
|
||||
|
||||
} else {
|
||||
t.Error("data is not json:\n", string(sdata))
|
||||
sdata := source.GetExt().([]byte)
|
||||
|
||||
if gjson.ValidBytes(sdata) {
|
||||
result := gjson.ParseBytes(sdata)
|
||||
datamap := result.Map()
|
||||
|
||||
oe := &OpenrecExtractor{}
|
||||
oe.user = intimate.NewExtractorSource(datamap["user"])
|
||||
oe.user.CreateExtractor()
|
||||
|
||||
oe.userLive = intimate.NewExtractorSource(datamap["user_live"])
|
||||
oe.userLive.CreateExtractor()
|
||||
|
||||
oe.supporters = intimate.NewExtractorSource(datamap["supporters"])
|
||||
|
||||
clog := &intimate.CollectLog{}
|
||||
|
||||
oe.extractFollowers(clog)
|
||||
oe.extractAnchorName(ai)
|
||||
oe.extractViewsAndLiveStreaming(clog)
|
||||
oe.extractGiversAndGratuity(clog)
|
||||
oe.extractLive(clog)
|
||||
oe.extractTags(clog)
|
||||
|
||||
ai.Set("UpdateTime", source.GetUpdateTime())
|
||||
|
||||
LiveUrl := "https://www.openrec.tv/live/" + anchorId
|
||||
ai.Set("LiveUrl", sql.NullString{String: LiveUrl, Valid: true})
|
||||
|
||||
Uid, err := collect.InsertAnchorInfo(ai)
|
||||
if err != nil {
|
||||
t.Error(err)
|
||||
return
|
||||
}
|
||||
|
||||
clog.Set("Uid", Uid)
|
||||
clog.Set("Platform", string(intimate.Popenrec))
|
||||
clog.Set("AnchorId", anchorId)
|
||||
clog.Set("UpdateTime", source.GetUpdateTime())
|
||||
|
||||
collect.InsertCollectLog(clog)
|
||||
} else {
|
||||
t.Error("data is not json:\n", string(sdata))
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user