TODO: twitch_task2 fix 错误

This commit is contained in:
eson
2020-09-08 18:24:51 +08:00
parent 28319bf02a
commit fb07d61353
14 changed files with 495 additions and 433 deletions

View File

@@ -70,13 +70,13 @@ func Execute() {
if userid := room.Get("id").String(); userid != "" {
streamer.UserId = userid
streamer.LiveUrl = sql.NullString{String: "https://www.nimo.tv/live/" + userid, Valid: true}
streamer.UserId = &userid
streamer.LiveUrl = &sql.NullString{String: "https://www.nimo.tv/live/" + userid, Valid: true}
channel := room.Get("roomTypeName").String()
streamer.Channel = sql.NullString{String: channel, Valid: channel != ""}
streamer.Channel = &sql.NullString{String: channel, Valid: channel != ""}
username := room.Get("anchorName").String()
streamer.UserName = sql.NullString{String: username, Valid: username != ""}
streamer.UserName = &sql.NullString{String: username, Valid: username != ""}
if rtags := room.Get("anchorLabels"); rtags.IsArray() {

View File

@@ -11,11 +11,11 @@ import (
"github.com/tidwall/gjson"
)
// sstore 源存储实例, 为存储源数据的实现. 表格具体参考sql/intimate_source.sql
var sstore *intimate.StoreSource = intimate.NewStoreSource(string(intimate.STOpenrec))
// // sstore 源存储实例, 为存储源数据的实现. 表格具体参考sql/intimate_source.sql
// var sstore *intimate.StoreSource = intimate.NewStoreSource(string(intimate.STOpenrec))
// estore 解析存储连接实例
var estore *intimate.StoreExtractor = intimate.NewStoreExtractor()
// // estore 解析存储连接实例
// var estore *intimate.StoreExtractor = intimate.NewStoreExtractor()
// Execute 执行方法
func Execute() {
@@ -71,7 +71,7 @@ func Execute() {
userid := User.Get("channel.id").String()
streamer := &intimate.Streamer{}
streamer.UserId = userid
streamer.UserId = &userid
streamer.Platform = intimate.Popenrec
updateUrl := make(map[string]interface{})
@@ -83,15 +83,16 @@ func Execute() {
updateUrlBytes, err := json.Marshal(updateUrl)
if err != nil {
estore.UpdateError(streamer, err)
intimate.TStreamer.UpdateError(streamer, err)
continue
}
streamer.UpdateUrl = updateUrlBytes
estore.InsertStreamer(streamer)
intimate.TStreamer.Insert(streamer)
}
}
log.Println("streamer count:", len(result.Array()), tp.ParsedURL.String())
// 修改url query 参数的page递增. 遍历所有页面
tp.QueryParam("page").IntAdd(1)
time.Sleep(time.Second * 1)

View File

@@ -1,2 +0,0 @@
openrec_task2
log

View File

@@ -1,5 +0,0 @@
package main
func main() {
Execute()
}

View File

@@ -1,154 +0,0 @@
package main
import (
"database/sql"
"encoding/json"
"intimate"
"log"
"time"
"github.com/474420502/gcurl"
"github.com/474420502/requests"
"github.com/tidwall/gjson"
)
// sstore 源存储实例, 为存储源数据的实现. 表格具体参考sql/intimate_source.sql
var sstore *intimate.StoreSource = intimate.NewStoreSource(string(intimate.STOpenrec))
// estore 源存储实例, 为存储源数据的实现. 表格具体参考sql/intimate_extractor.sql
var estore *intimate.StoreExtractor = intimate.NewStoreExtractor()
func init() {
}
// Execute 执行方法
func Execute() {
ps := intimate.NewPerfectShutdown()
ses := requests.NewSession()
var lasterr error = nil
for !ps.IsClose() {
streamer, err := estore.Pop(intimate.Popenrec) //队列里弹出一个streamer行. 进行解析
if streamer == nil || err != nil {
if err != lasterr {
log.Println(err, lasterr)
lasterr = err
}
time.Sleep(time.Second * 2)
continue
}
userId := streamer.UserId
var updateUrl map[string]string
err = json.Unmarshal(streamer.UpdateUrl.([]byte), &updateUrl) // 反序列化update_url, 里面存了需要采集的url
if err != nil {
log.Println(err)
continue
}
// Check Userid
userUrl := updateUrl["user"]
log.Println(userUrl)
tp := ses.Get(userUrl) // 获取user url页面数据
resp, err := tp.Execute()
streamer.UpdateTime = sql.NullTime{Time: time.Now(), Valid: true}
if err != nil {
log.Println(err)
estore.UpdateError(streamer, err)
continue
}
cookies := ses.GetCookies(tp.GetParsedURL())
scurl := updateUrl["supporters"] //获取打赏者的数据
curl := gcurl.Parse(scurl)
supportersSession := curl.CreateSession()
temporary := curl.CreateTemporary(supportersSession)
supportersSession.SetCookies(temporary.GetParsedURL(), cookies)
var supporters []string
for { // supporters 数据需要登录信息. 下面为赋值 supporters链接获取的uid token random码
supportersQuery := temporary.GetQuery()
for _, cookie := range cookies {
if cookie.Name == "uuid" {
supportersQuery.Set("Uuid", cookie.Value)
continue
}
if cookie.Name == "token" {
supportersQuery.Set("Token", cookie.Value)
continue
}
if cookie.Name == "random" {
supportersQuery.Set("Random", cookie.Value)
continue
}
}
supportersQuery.Set("identify_id", userId)
temporary.SetQuery(supportersQuery)
resp, err := temporary.Execute()
if err != nil {
log.Println(err)
}
supporterjson := gjson.ParseBytes(resp.Content())
supporterdata := supporterjson.Get("data") //解析supporters获取的json数据
if supporterdata.Type == gjson.Null {
break
}
supporters = append(supporters, string(resp.Content()))
temporary.QueryParam("page_number").IntAdd(1)
}
// cookies := cxt.Session().GetCookies(wf.GetParsedURL())
ext := make(map[string]interface{})
ext["json_supporters"] = supporters
ext["html_user"] = string(resp.Content())
liveUrl := updateUrl["live"]
tp = ses.Get(liveUrl)
resp, err = tp.Execute()
if err != nil {
log.Println(err)
estore.UpdateError(streamer, err)
continue
}
ext["html_live"] = string(resp.Content())
ext["var_user_id"] = userId
extJsonBytes, err := json.Marshal(ext)
if err != nil {
log.Println(err)
estore.UpdateError(streamer, err)
continue
}
// streamer.Platform = intimate.Popenrec
streamer.UpdateInterval = 120
streamer.UpdateTime = sql.NullTime{Time: time.Now(), Valid: true}
streamer.Operator = 0
source := &intimate.Source{}
source.Target = intimate.TOpenrecUser
source.Ext = string(extJsonBytes)
source.StreamerId = sql.NullInt64{Int64: streamer.Uid, Valid: true}
sstore.Insert(source)
estore.UpdateStreamer(streamer)
}
}

View File

@@ -1,9 +0,0 @@
package main
import (
"testing"
)
func TestMain(t *testing.T) {
main()
}

View File

@@ -75,8 +75,10 @@ func Execute() {
sl.Operator = 0
sl.UpdateInterval = 120
sl.UpdateTime = time.Now()
sl.UrlHash = intimate.GetUrlHash(sl.Url)
estore.InsertStreamerList(sl)
intimate.TStreamerList.Insert(sl)
// estore.InsertStreamerList(sl)
queue.Put(wurl)
queuedict[wurl] = true
@@ -107,7 +109,8 @@ func Execute() {
sl.Operator = 0
sl.UpdateInterval = 120
sl.UpdateTime = time.Now()
estore.InsertStreamerList(sl)
sl.UrlHash = intimate.GetUrlHash(sl.Url)
intimate.TStreamerList.Insert(sl)
queue.Put(wurl)
queuedict[wurl] = true

View File

@@ -1,7 +1,6 @@
package main
import (
"database/sql"
"intimate"
"log"
"time"
@@ -9,100 +8,120 @@ import (
"github.com/tebeka/selenium"
)
// sstore 源存储实例, 为存储源数据的实现. 表格具体参考sql/intimate_source.sql
var sstore *intimate.StoreSource = intimate.NewStoreSource(string(intimate.STTwitch))
// // sstore 源存储实例, 为存储源数据的实现. 表格具体参考sql/intimate_source.sql
// var sstore *intimate.StoreSource = intimate.NewStoreSource(string(intimate.STTwitch))
// estore 解析存储连接实例
var estore *intimate.StoreExtractor = intimate.NewStoreExtractor()
// // estore 解析存储连接实例
// var estore *intimate.StoreExtractor = intimate.NewStoreExtractor()
// 获取类型的所有频道链接
// Execute 执行任务
func Execute() {
var err error
wd := intimate.GetChromeDriver(3030)
ps := intimate.NewPerfectShutdown()
weburl := "https://www.twitch.tv/directory?sort=VIEWER_COUNT"
err = wd.Get(weburl)
if err != nil {
panic(err)
}
for !ps.IsClose() {
var err error
wd := intimate.GetChromeDriver(3030)
cardCondition := func(wd selenium.WebDriver) (bool, error) {
elements, err := wd.FindElements(selenium.ByXPATH, "//span/a[contains(@data-a-target,'card-') and @href]")
weburl := "https://www.twitch.tv/directory?sort=VIEWER_COUNT"
err = wd.Get(weburl)
if err != nil {
return false, err
}
return len(elements) > 0, nil
}
wd.WaitWithTimeout(cardCondition, time.Second*15)
time.Sleep(time.Second)
e, err := wd.FindElement(selenium.ByXPATH, "//button[@data-a-target='browse-sort-menu']")
if err != nil {
panic(err)
}
e.Click()
var hrefs map[string]bool = make(map[string]bool)
var delayerror = 5
for i := 0; i <= 200; i++ {
cards, err := wd.FindElements(selenium.ByXPATH, "//span/a[contains(@data-a-target,'card-') and @href]")
if err != nil {
log.Println(err)
break
panic(err)
}
if len(hrefs) == 0 {
delayerror--
if delayerror <= 0 {
cardCondition := func(wd selenium.WebDriver) (bool, error) {
elements, err := wd.FindElements(selenium.ByXPATH, "//span/a[contains(@data-a-target,'card-') and @href]")
if err != nil {
return false, err
}
return len(elements) > 0, nil
}
wd.WaitWithTimeout(cardCondition, time.Second*15)
time.Sleep(time.Second)
e, err := wd.FindElement(selenium.ByXPATH, "//button[@data-a-target='browse-sort-menu']")
if err != nil {
panic(err)
}
e.Click()
var lasthreflen = 0
var hrefs map[string]bool = make(map[string]bool)
var delayerror = 5
for i := 0; i <= 200; i++ {
cards, err := wd.FindElements(selenium.ByXPATH, "//span/a[contains(@data-a-target,'card-') and @href]")
if err != nil {
log.Println(err)
break
}
} else {
delayerror = 5
}
for ii := 0; ii < 10; ii++ {
for _, card := range cards {
href, err := card.GetAttribute("href")
if err != nil {
log.Println(href, err)
continue
} else {
hrefs[href] = true
if len(hrefs) == lasthreflen {
delayerror--
if delayerror <= 0 {
break
}
} else {
delayerror = 7
}
break
}
lasthreflen = len(hrefs)
if ps.IsClose() {
break
}
for ii := 0; ii < 10; ii++ {
for _, card := range cards {
href, err := card.GetAttribute("href")
if err != nil {
log.Println(href, err)
continue
} else {
hrefs[href] = true
}
}
break
}
if len(cards) > 10 {
log.Println(len(cards))
wd.ExecuteScript(`items = document.evaluate("//div[@data-target='directory-page__card-container']/../self::div[@data-target and @style]", document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
if ps.IsClose() {
break
}
if len(cards) > 10 {
log.Println(len(cards))
wd.ExecuteScript(`items = document.evaluate("//div[@data-target='directory-page__card-container']/../self::div[@data-target and @style]", document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
for (var i = 0; i < items.snapshotLength - 10; i++) { item = items.snapshotItem(i); item.remove() ;};`, nil)
}
time.Sleep(time.Millisecond * 200)
wd.KeyDown(selenium.EndKey)
time.Sleep(time.Millisecond * 200)
wd.KeyUp(selenium.EndKey)
time.Sleep(time.Millisecond * 2500)
}
time.Sleep(time.Millisecond * 200)
wd.KeyDown(selenium.EndKey)
time.Sleep(time.Millisecond * 200)
wd.KeyUp(selenium.EndKey)
time.Sleep(time.Millisecond * 2500)
for href := range hrefs {
sl := &intimate.StreamerList{}
sl.Url = href
sl.UrlHash = intimate.GetUrlHash(sl.Url)
sl.Platform = string(intimate.Ptwitch)
sl.UpdateTime = intimate.GetUpdateTimeNow()
err := intimate.TStreamerList.Insert(sl)
if err != nil {
log.Println(err)
}
// TODO: Save href
// source := &intimate.Source{}
// source.Source = sql.NullString{String: href, Valid: true}
// source.Operator = 0
// source.Target = intimate.TTwitchChannel
// source.Url = weburl
// sstore.Insert(source)
}
log.Println("hrefs len:", len(hrefs))
// sstore.Deduplicate(intimate.TTwitchChannel, "source")
wd.Close()
wd.Quit()
time.Sleep(time.Minute * 30)
}
for href := range hrefs {
// TODO: Save href
source := &intimate.Source{}
source.Source = sql.NullString{String: href, Valid: true}
source.Operator = 0
source.Target = intimate.TTwitchChannel
source.Url = weburl
sstore.Insert(source)
}
log.Println("hrefs len:", len(hrefs))
sstore.Deduplicate(intimate.TTwitchChannel, "source")
}