完成解析模块示例. 数据库入库测试.

TODO: 调整程序启动停止(非暴力关闭).
This commit is contained in:
eson
2020-07-16 15:25:55 +08:00
parent 51fe6f6039
commit 7e3b36c7d0
7 changed files with 361 additions and 322 deletions

View File

@@ -0,0 +1,215 @@
package main
import (
"database/sql"
"encoding/json"
"intimate"
"log"
"regexp"
"strconv"
"strings"
"time"
"github.com/tidwall/gjson"
)
// OpenrecExtractor extracts anchor, live-stream and supporter data from
// pre-fetched Openrec page sources. Each field wraps one raw source document;
// Execute populates them per popped queue item before running the extractors.
type OpenrecExtractor struct {
	user       *intimate.ExtractorSource // anchor profile page source (HTML)
	userLive   *intimate.ExtractorSource // live-stream page source (HTML)
	supporters *intimate.ExtractorSource // supporters ("yell") JSON source
}
// extractFollowers parses the follower count from the user page and stores it
// into clog under "Followers". On any failure (XPath error, missing node,
// unparseable number) it logs and returns without writing, instead of the old
// behavior of dereferencing a possibly-nil result or recording a bogus 0.
func (oe *OpenrecExtractor) extractFollowers(clog intimate.ISet) {
	extractor := oe.user.GetExtractor()
	xp, err := extractor.XPathResult("//p[@class='c-global__user__count__row__right js-userCountFollowers']/text()")
	if err != nil {
		// xp may be nil here; bail out before touching it.
		log.Println(err)
		return
	}
	if !xp.NodeIter().Next() {
		log.Println("不存在粉丝数")
		return
	}
	// The page renders counts with thousands separators, e.g. "12,345".
	followers := strings.ReplaceAll(xp.String(), ",", "")
	followersInt, err := strconv.ParseInt(followers, 10, 64)
	if err != nil {
		log.Println(err)
		return
	}
	clog.Set("Followers", sql.NullInt64{Int64: followersInt, Valid: true})
}
// extractAnchorName reads the anchor's display name from the user page and
// stores it into ai under "AnchorName". The original called xp.NodeIter()
// before checking err, panicking on a nil xp when the XPath query failed;
// the error is now checked first.
func (oe *OpenrecExtractor) extractAnchorName(ai intimate.ISet) {
	extractor := oe.user.GetExtractor()
	xp, err := extractor.XPathResult("//p[@class='c-global__user__profile__list__name__text official-icon--after']/text()")
	if err != nil {
		log.Println(err)
		return
	}
	if !xp.NodeIter().Next() {
		log.Println("anchor name node not found")
		return
	}
	ai.Set("AnchorName", xp.String())
}
// extractViewsAndLiveStreaming looks for a live viewer-count node on the user
// page. Its presence implies the anchor is currently streaming, so both
// "Views" and "IsLiveStreaming" are set together. Errors now short-circuit:
// the original kept using a possibly-nil xp after only logging the XPath
// error, and recorded Views even when the count failed to parse.
func (oe *OpenrecExtractor) extractViewsAndLiveStreaming(clog intimate.ISet) {
	extractor := oe.user.GetExtractor()
	// The live counter lives inside the "c-contents" video list.
	xp, err := extractor.XPathResult("//ul[@class='c-contents']//p[@class='c-thumbnailVideo__footer__liveCount']/text()")
	if err != nil {
		log.Println(err)
		return
	}
	if !xp.NodeIter().Next() {
		// No live-count node: anchor is not streaming; leave clog untouched.
		return
	}
	views := regexp.MustCompile(`[0-9,]+`).FindString(xp.String())
	views = strings.ReplaceAll(views, ",", "")
	viewsint, err := strconv.Atoi(views)
	if err != nil {
		log.Println(err)
		return
	}
	clog.Set("Views", sql.NullInt64{Int64: int64(viewsint), Valid: true})
	clog.Set("IsLiveStreaming", int32(1))
}
// extractGiversAndGratuity walks the supporters JSON pages, collecting every
// supporter item into "Giver" (as a JSON array) and summing their
// "total_yells" into "Gratuity". A marshal failure is recorded in "ErrorMsg"
// instead of "Giver"; "Gratuity" is written unconditionally.
func (oe *OpenrecExtractor) extractGiversAndGratuity(clog intimate.ISet) {
	supporterPages := oe.supporters.GetSource()

	var entries []interface{}
	var totalYells int64
	for _, page := range supporterPages.Array() {
		parsed := gjson.Parse(page.String())
		items := parsed.Get("data.items").Array()
		for _, supporter := range items {
			entries = append(entries, supporter.Map())
			totalYells += supporter.Get("total_yells").Int()
		}
	}

	if encoded, err := json.Marshal(entries); err == nil {
		clog.Set("Giver", encoded)
	} else {
		log.Println(err)
		clog.Set("ErrorMsg", sql.NullString{String: err.Error(), Valid: true})
	}
	clog.Set("Gratuity", sql.NullInt64{Int64: totalYells, Valid: true})
}
// extractLive extracts the live title, start time and computed end time
// (start + duration) from the live page. The title comes from a regex over
// the raw HTML (the MovieTitle__Title class carries a hashed suffix, so a
// plain XPath class match is unreliable); timestamps come from schema.org
// <meta itemprop=...> tags. Guard clauses replace the original's nesting and
// fix two defects: `content` was dereferenced after its XPath error was only
// logged, and a failed time parse still recorded a zero LiveStartTime.
func (oe *OpenrecExtractor) extractLive(clog intimate.ISet) {
	extractor := oe.userLive.GetExtractor()
	matches := regexp.MustCompile("MovieTitle__Title[^>]+>(.{1,50})</h1>").FindStringSubmatch(oe.userLive.GetSource().Str)
	if len(matches) != 2 {
		// No title on the page: not a live/video view, nothing to extract.
		return
	}
	clog.Set("LiveTitle", sql.NullString{String: matches[1], Valid: true})

	content, err := extractor.XPathResult("//meta[@itemprop='uploadDate']/@content")
	if err != nil {
		log.Println(err)
		return
	}
	iter := content.NodeIter()
	if !iter.Next() {
		return
	}
	// uploadDate is RFC3339, e.g. "2020-07-16T15:04:05+09:00".
	tm, err := time.ParseInLocation("2006-01-02T15:04:05Z07:00", iter.Node().NodeValue(), time.Local)
	if err != nil {
		log.Println(err)
		return
	}
	clog.Set("LiveStartTime", sql.NullTime{Time: tm.Local(), Valid: true})

	duration, err := extractor.XPathResult("//meta[@itemprop='duration']/@content")
	if err != nil {
		log.Println(err)
		return
	}
	diter := duration.NodeIter()
	if !diter.Next() {
		return
	}
	dt, err := intimate.ParseDuration(diter.Node().NodeValue())
	if err != nil {
		log.Println(err)
		return
	}
	clog.Set("LiveEndTime", sql.NullTime{Time: tm.Add(dt).Local(), Valid: true})
}
// extractTags scrapes the tag-button labels from the raw live-page HTML and
// stores them as a JSON array under "Tags". Fixes: the leftover debug
// log.Println(tags) is removed, and "Tags" is no longer written with a nil
// byte slice when json.Marshal fails.
func (oe *OpenrecExtractor) extractTags(clog intimate.ISet) {
	// TagButton__Button carries a hashed class suffix, hence the regex
	// instead of an XPath class match.
	var tags []string
	matches := regexp.MustCompile(`TagButton__Button[^>]+>(.{1,100})</a`).FindAllStringSubmatch(oe.userLive.GetSource().Str, -1)
	for _, m := range matches {
		tags = append(tags, m[1])
	}
	tagsBytes, err := json.Marshal(tags)
	if err != nil {
		log.Println(err)
		return
	}
	clog.Set("Tags", tagsBytes)
}
// Execute drains the "source_openrec" queue: for each popped ranking source
// it builds the extractor sources from the stored JSON ("user", "user_live",
// "supporters"), runs every extract step, and persists one AnchorInfo row
// plus one CollectLog row keyed by the returned Uid. The loop only stops
// when Pop or InsertAnchorInfo fails.
// TODO: 调整程序启动停止(非暴力关闭) — add a cancellation signal so this
// loop can shut down gracefully instead of running until an error.
func (oe *OpenrecExtractor) Execute() {
	collect := intimate.NewExtractorStore()
	store := intimate.NewSourceStore("source_openrec")
	for {
		source, err := store.Pop(string(intimate.TTOpenrecRanking), 100)
		if err != nil {
			log.Println(err)
			return
		}

		anchorId := source.GetSource().String
		ai := &intimate.AnchorInfo{}
		ai.SetAnchorId(anchorId)
		ai.SetPlatform(string(intimate.Popenrec))

		sdata := source.GetExt().([]byte)
		if !gjson.ValidBytes(sdata) {
			log.Println("data is not json:\n", string(sdata))
			continue
		}

		// Populate the receiver's sources directly. The original shadowed
		// the receiver with a fresh `oe := &OpenrecExtractor{}`, which made
		// the receiver's own fields dead weight.
		datamap := gjson.ParseBytes(sdata).Map()
		oe.user = intimate.NewExtractorSource(datamap["user"])
		oe.user.CreateExtractor()
		oe.userLive = intimate.NewExtractorSource(datamap["user_live"])
		oe.userLive.CreateExtractor()
		oe.supporters = intimate.NewExtractorSource(datamap["supporters"])

		clog := &intimate.CollectLog{}
		oe.extractFollowers(clog)
		oe.extractAnchorName(ai)
		oe.extractViewsAndLiveStreaming(clog)
		oe.extractGiversAndGratuity(clog)
		oe.extractLive(clog)
		oe.extractTags(clog)

		ai.Set("UpdateTime", source.GetUpdateTime())
		liveURL := "https://www.openrec.tv/live/" + anchorId
		ai.Set("LiveUrl", sql.NullString{String: liveURL, Valid: true})

		uid, err := collect.InsertAnchorInfo(ai)
		if err != nil {
			log.Println(err)
			return
		}
		clog.Set("Uid", uid)
		clog.Set("Platform", string(intimate.Popenrec))
		clog.Set("AnchorId", anchorId)
		clog.Set("UpdateTime", source.GetUpdateTime())
		// NOTE(review): InsertCollectLog's result is ignored, as in the
		// original — confirm whether it returns an error worth handling.
		collect.InsertCollectLog(clog)
	}
}

View File

@@ -2,19 +2,14 @@ package main
import (
"database/sql"
"encoding/json"
"intimate"
"io/ioutil"
"log"
"os"
"regexp"
"strconv"
"strings"
"testing"
"time"
"github.com/474420502/hunter"
"github.com/474420502/requests"
"github.com/lestrrat-go/libxml2"
"github.com/tidwall/gjson"
)
@@ -100,187 +95,66 @@ func TestCase(t *testing.T) {
}
func TestExtractor(t *testing.T) {
ses := requests.NewSession()
tp := ses.Get("https://www.openrec.tv/user/Riowh/supporters")
tp.Execute()
// t.Error(ses.GetCookies(wf.GetParsedURL()))
collect := intimate.NewExtractorStore()
store := intimate.NewSourceStore("source_openrec")
source, err := store.Pop(string(intimate.TTOpenrecRanking), 100)
if err != nil {
log.Println(err)
return
}
anchorId := source.GetSource().String
ai := &intimate.AnchorInfo{}
ai.SetAnchorId(anchorId)
ai.SetPlatform(string(intimate.Popenrec))
sdata := source.GetExt().([]byte)
if gjson.ValidBytes(sdata) {
result := gjson.ParseBytes(sdata)
m := result.Map()
user := m["user"]
clog := &intimate.CollectLog{}
extractor := hunter.NewExtractor([]byte(user.Str))
xp, err := extractor.XPathResult("//p[@class='c-global__user__count__row__right js-userCountFollowers']/text()")
if err != nil {
t.Error(err)
}
if !xp.NodeIter().Next() {
t.Error("不存在粉丝数")
}
followers := strings.ReplaceAll(xp.String(), ",", "")
followersInt, err := strconv.ParseInt(followers, 10, 64)
if err != nil {
t.Error(err)
}
var anchorName string
xp, err = extractor.XPathResult("//p[@class='c-global__user__profile__list__name__text official-icon--after']/text()")
if xp.NodeIter().Next() {
anchorName = xp.String()
} else {
t.Error(err)
}
t.Error(source.GetSource())
t.Error(anchorName)
ai.SetAnchorName(anchorName)
// c-contents
xp, err = extractor.XPathResult("//ul[@class='c-contents']//p[@class='c-thumbnailVideo__footer__liveCount']/text()")
for {
source, err := store.Pop(string(intimate.TTOpenrecRanking), 100)
if err != nil {
log.Println(err)
}
if xp.NodeIter().Next() {
views := regexp.MustCompile(`[0-9,]+`).FindString(xp.String())
views = strings.ReplaceAll(views, ",", "")
viewsint, err := strconv.Atoi(views)
if err != nil {
t.Error(err)
}
clog.SetViews(sql.NullInt64{Int64: int64(viewsint), Valid: true})
clog.SetIsShowing(1)
}
var givers []interface{}
var gratuity int64 = 0
giverjson := m["supporters"]
for _, v := range giverjson.Array() {
giverSource := gjson.Parse(v.String())
for _, item := range giverSource.Get("data.items").Array() {
givers = append(givers, item.Map())
gratuity += item.Get("total_yells").Int()
}
}
giversbytes, err := json.Marshal(givers)
if err != nil {
t.Error(err)
clog.SetErrorMsg(sql.NullString{String: err.Error(), Valid: true})
} else {
clog.SetGiver(giversbytes)
}
// MovieToolbar__Views-g5e6ic-13 iDRGyA
livejson := m["user_live"]
f, err := os.OpenFile("./test.html", os.O_CREATE|os.O_TRUNC|os.O_RDWR, os.ModePerm)
if err != nil {
panic(err)
}
f.WriteString(livejson.String())
extractor = hunter.NewExtractor([]byte(livejson.Str))
// xr, err := extractor.XPathResult("//h1[ contains(@class, 'MovieTitle__Title')]")
// if err != nil {
// t.Error(err)
// }
mathes := regexp.MustCompile("MovieTitle__Title[^>]+>(.{1,50})</h1>").FindStringSubmatch(livejson.Str)
if len(mathes) == 2 {
clog.SetShowTitle(sql.NullString{String: mathes[1], Valid: true})
content, err := extractor.XPathResult("//meta[@itemprop='uploadDate']/@content")
if err != nil {
t.Error(err)
}
iter := content.NodeIter()
if iter.Next() {
tm, err := time.ParseInLocation("2006-01-02T15:04:05Z07:00", iter.Node().NodeValue(), time.Local)
if err != nil {
t.Error(err)
}
clog.SetShowStartTime(sql.NullTime{Time: tm.Local(), Valid: true})
duration, err := extractor.XPathResult("//meta[@itemprop='duration']/@content")
if err != nil {
t.Error(err)
}
diter := duration.NodeIter()
if diter.Next() {
dt, err := intimate.ParseDuration(diter.Node().NodeValue())
if err != nil {
log.Println(err)
}
endtm := tm.Add(dt)
clog.SetShowEndTime(sql.NullTime{Time: endtm.Local(), Valid: true})
}
}
}
var tags []string
matheslist := regexp.MustCompile(`TagButton__Button[^>]+>(.{1,100})</a`).FindAllStringSubmatch(livejson.Str, -1)
for _, m := range matheslist {
tags = append(tags, m[1])
}
t.Error(tags)
tagsBytes, err := json.Marshal(tags)
if err != nil {
log.Println(err)
}
ai.SetTags(tagsBytes)
ai.SetUpdateTime(source.GetUpdateTime())
LiveUrl := "https://www.openrec.tv/live/" + anchorId
ai.SetLiveUrl(sql.NullString{String: LiveUrl, Valid: true})
Uid, err := collect.InsertAnchorInfo(ai)
if err != nil {
t.Error(err)
return
}
clog.SetUid(Uid)
clog.SetTags(tagsBytes)
clog.SetGratuity(sql.NullInt64{Int64: gratuity, Valid: true})
clog.SetPlatform(string(intimate.Popenrec))
clog.SetFollowers(sql.NullInt64{Int64: int64(followersInt), Valid: true})
clog.SetAnchorId(anchorId)
clog.SetUpdateTime(source.GetUpdateTime())
anchorId := source.GetSource().String
collect.InsertCollectLog(clog)
ai := &intimate.AnchorInfo{}
ai.SetAnchorId(anchorId)
ai.SetPlatform(string(intimate.Popenrec))
} else {
t.Error("data is not json:\n", string(sdata))
sdata := source.GetExt().([]byte)
if gjson.ValidBytes(sdata) {
result := gjson.ParseBytes(sdata)
datamap := result.Map()
oe := &OpenrecExtractor{}
oe.user = intimate.NewExtractorSource(datamap["user"])
oe.user.CreateExtractor()
oe.userLive = intimate.NewExtractorSource(datamap["user_live"])
oe.userLive.CreateExtractor()
oe.supporters = intimate.NewExtractorSource(datamap["supporters"])
clog := &intimate.CollectLog{}
oe.extractFollowers(clog)
oe.extractAnchorName(ai)
oe.extractViewsAndLiveStreaming(clog)
oe.extractGiversAndGratuity(clog)
oe.extractLive(clog)
oe.extractTags(clog)
ai.Set("UpdateTime", source.GetUpdateTime())
LiveUrl := "https://www.openrec.tv/live/" + anchorId
ai.Set("LiveUrl", sql.NullString{String: LiveUrl, Valid: true})
Uid, err := collect.InsertAnchorInfo(ai)
if err != nil {
t.Error(err)
return
}
clog.Set("Uid", Uid)
clog.Set("Platform", string(intimate.Popenrec))
clog.Set("AnchorId", anchorId)
clog.Set("UpdateTime", source.GetUpdateTime())
collect.InsertCollectLog(clog)
} else {
t.Error("data is not json:\n", string(sdata))
}
}
}