From ea650f91dca37497c02f4ea777e6fe872050d18f Mon Sep 17 00:00:00 2001
From: eson
Date: Fri, 24 Jul 2020 18:48:33 +0800
Subject: [PATCH] TODO: twitch get all userid list

---
Review notes (text between "---" and the first diff is ignored on apply):
- Line breaks were restored from a whitespace-collapsed copy; blank context
  lines and indentation are inferred from the hunk line counts.
- store.go: Deduplicate was missing a space before "force index", grouped
  by the target value instead of field, and deleted the rows of every
  other target type. target is now a bound parameter.
- utils.go: GetChromeDriver checks the NewRemote error before using the
  driver, the finalizer no longer captures wd, and the injected script
  uses "window" instead of "windows".
- twitch_task1 / twitch_task2: the browser session is quit explicitly; the
  wait condition in twitch_task2 no longer aborts on the first miss.
- Added comments are in English. Hunk headers and the diffstat were
  updated; the post-image hashes on the "index" lines are stale.
- Not changed: the hard-coded extension path, the shared port 3030, the
  unfinished scroll loop in twitch_task2, and the popped source is not
  updated after processing.

 platform_list.go                              |   5 +-
 store.go                                      |  19 +++
 target_type_list.go                           |   3 +
 tasks/twitch/twitch_task1/main.go             |   6 +
 tasks/twitch/twitch_task1/task_twitch.go      |  37 ++----
 tasks/twitch/twitch_task1/task_twitch_test.go |   4 +
 tasks/twitch/twitch_task2/main.go             |   6 +
 tasks/twitch/twitch_task2/task_twitch.go      | 130 ++++++++++++++++++
 tasks/twitch/twitch_task2/task_twitch_test.go |   9 ++
 utils.go                                      |  53 +++++++
 10 files changed, 245 insertions(+), 27 deletions(-)
 create mode 100644 tasks/twitch/twitch_task1/main.go
 create mode 100644 tasks/twitch/twitch_task2/main.go
 create mode 100644 tasks/twitch/twitch_task2/task_twitch.go
 create mode 100644 tasks/twitch/twitch_task2/task_twitch_test.go

diff --git a/platform_list.go b/platform_list.go
index d694917..0bdefc0 100644
--- a/platform_list.go
+++ b/platform_list.go
@@ -4,6 +4,9 @@ package intimate
 type Platform string
 
 const (
-	// Popenrec openrec源table名称
+	// Popenrec is the openrec platform.
 	Popenrec Platform = "openrec"
+
+	// Ptwitch is the twitch platform.
+	Ptwitch Platform = "twitch"
 )
diff --git a/store.go b/store.go
index e3e1625..2c07234 100644
--- a/store.go
+++ b/store.go
@@ -85,6 +85,25 @@ func (store *StoreSource) Insert(isource IGet) {
 	}
 }
 
+// Deduplicate deletes duplicate rows of the given target type from the
+// source table. For each distinct value of field only the row with the
+// largest uid is kept; rows of other target types are not touched.
+// It panics if the statement fails.
+//
+// field is concatenated into the statement as a column name, so it must be
+// a trusted identifier and never user input.
+func (store *StoreSource) Deduplicate(target Target, field string) {
+	// NOTE(review): the inner SELECT is wrapped in the derived table s,
+	// presumably so the DELETE does not read directly from the table it
+	// modifies - confirm against the database in use.
+	query := `DELETE FROM ` + store.table +
+		` WHERE target_type = ? AND uid NOT IN (SELECT MAX(s.uid) FROM (SELECT uid, ` + field +
+		` FROM ` + store.table + ` force index(target_type_idx) WHERE target_type = ?) s GROUP BY s.` + field + `)`
+	if _, err := store.db.Exec(query, string(target), string(target)); err != nil {
+		panic(err)
+	}
+}
+
 // Update 更新数据
 func (store *StoreSource) Update(isource IGet) {
 	_, err := store.db.Exec("update "+store.table+" set ext = ?, pass_gob = ?, operator = ?, error_msg = ? where uid = ?", isource.Get("Ext"), isource.Get("PassGob"), isource.Get("Operator"), isource.Get("ErrorMsg"), isource.Get("Uid"))
diff --git a/target_type_list.go b/target_type_list.go
index e21a40a..941e05d 100644
--- a/target_type_list.go
+++ b/target_type_list.go
@@ -12,4 +12,7 @@ const (
 
 	// TTwitchChannel twitch 获取类别操作目标
 	TTwitchChannel Target = "twitch_channel"
+
+	// TTwitchUser is the operation target for twitch users.
+	TTwitchUser Target = "twitch_user"
 )
diff --git a/tasks/twitch/twitch_task1/main.go b/tasks/twitch/twitch_task1/main.go
new file mode 100644
index 0000000..b0019dd
--- /dev/null
+++ b/tasks/twitch/twitch_task1/main.go
@@ -0,0 +1,6 @@
+package main
+
+func main() {
+	e := ChannelLink{}
+	e.Execute()
+}
diff --git a/tasks/twitch/twitch_task1/task_twitch.go b/tasks/twitch/twitch_task1/task_twitch.go
index 2fe5560..4f0aa50 100644
--- a/tasks/twitch/twitch_task1/task_twitch.go
+++ b/tasks/twitch/twitch_task1/task_twitch.go
@@ -2,13 +2,11 @@ package main
 
 import (
 	"database/sql"
-	"fmt"
 	"intimate"
 	"log"
 	"time"
 
 	"github.com/tebeka/selenium"
-	"github.com/tebeka/selenium/chrome"
 )
 
 // sstore 源存储实例, 为存储源数据的实现. 表格具体参考sql/intimate_source.sql
@@ -25,32 +23,16 @@ type ChannelLink struct {
 
 // Execute 执行任务
 func (cl *ChannelLink) Execute() {
-	caps := selenium.Capabilities{"browserName": "chrome"}
-	chromecaps := chrome.Capabilities{}
-	err := chromecaps.AddExtension("/home/eson/test/ssh-key/0.1.2_0.crx")
-	if err != nil {
-		panic(err)
-	}
-	chromecaps.Args = append(chromecaps.Args, "--disk-cache-dir=/home/eson/test/ssh-key/cache")
-	chromecaps.ExcludeSwitches = append(chromecaps.ExcludeSwitches, "enable-automation")
-	caps.AddChrome(chromecaps)
-	_, err = selenium.NewChromeDriverService("/usr/bin/chromedriver", 3030)
-	if err != nil {
-		panic(err)
-	}
-	wd, err := selenium.NewRemote(caps, fmt.Sprintf("http://localhost:%d/wd/hub", 3030))
-	defer func() {
-		if err := wd.Close(); err != nil {
-			log.Println(err)
-		}
-	}()
-	wd.ExecuteScript("windows.navigator.webdriver = undefined", nil)
-	if err != nil {
-		panic(err)
-	}
+	var err error
+	wd := intimate.GetChromeDriver(3030)
+	// Quit the browser session when the task ends, including on panic.
+	defer func() {
+		if err := wd.Quit(); err != nil {
+			log.Println(err)
+		}
+	}()
 
 	weburl := "https://www.twitch.tv/directory?sort=VIEWER_COUNT"
-
 	err = wd.Get(weburl)
 	if err != nil {
 		panic(err)
@@ -81,6 +63,7 @@ func (cl *ChannelLink) Execute() {
 	if err != nil {
 		panic(err)
 	}
+	// xpath: //article//a[@data-a-target='preview-card-title-link']
 	for _, ele := range elements {
 		href, err := ele.GetAttribute("href")
 		if err != nil {
@@ -94,4 +77,6 @@ func (cl *ChannelLink) Execute() {
 		source.Url = weburl
 		sstore.Insert(source)
 	}
+
+	sstore.Deduplicate(intimate.TTwitchChannel, "source")
 }
diff --git a/tasks/twitch/twitch_task1/task_twitch_test.go b/tasks/twitch/twitch_task1/task_twitch_test.go
index 71666a7..e61dc1e 100644
--- a/tasks/twitch/twitch_task1/task_twitch_test.go
+++ b/tasks/twitch/twitch_task1/task_twitch_test.go
@@ -8,3 +8,7 @@ func TestCase1(t *testing.T) {
 	e := ChannelLink{}
 	e.Execute()
 }
+
+func TestLiveUrl(t *testing.T) {
+
+}
diff --git a/tasks/twitch/twitch_task2/main.go b/tasks/twitch/twitch_task2/main.go
new file mode 100644
index 0000000..d81b18b
--- /dev/null
+++ b/tasks/twitch/twitch_task2/main.go
@@ -0,0 +1,6 @@
+package main
+
+func main() {
+	ul := UserList{}
+	ul.Execute()
+}
diff --git a/tasks/twitch/twitch_task2/task_twitch.go b/tasks/twitch/twitch_task2/task_twitch.go
new file mode 100644
index 0000000..eb3780a
--- /dev/null
+++ b/tasks/twitch/twitch_task2/task_twitch.go
@@ -0,0 +1,130 @@
+package main
+
+import (
+	"database/sql"
+	"encoding/json"
+	"intimate"
+	"log"
+	"regexp"
+	"time"
+
+	"github.com/tebeka/selenium"
+)
+
+// sstore is the source store instance used to persist source data. See
+// sql/intimate_source.sql for the table layout.
+var sstore *intimate.StoreSource = intimate.NewStoreSource(string(intimate.STTwitch))
+
+// estore is the extractor store connection instance.
+var estore *intimate.StoreExtractor = intimate.NewStoreExtractor()
+
+// userLinkPattern captures the user id from a twitch channel link. It is
+// compiled once here rather than once per scraped link.
+var userLinkPattern = regexp.MustCompile(`https://www\.twitch\.tv/(\w+)`)
+
+// UserList is the task that stores the streamers found on a twitch channel
+// source page.
+type UserList struct {
+}
+
+// Execute pops one twitch channel source, opens its page sorted by viewer
+// count and inserts a streamer record for every preview-card link whose
+// href matches userLinkPattern. It panics if the source cannot be popped
+// or the page cannot be loaded and queried.
+func (cl *UserList) Execute() {
+	wd := intimate.GetChromeDriver(3030)
+	// Quit the browser session when the task ends, including on panic.
+	defer func() {
+		if err := wd.Quit(); err != nil {
+			log.Println(err)
+		}
+	}()
+
+	sourceChannel, err := sstore.Pop(intimate.TTwitchChannel)
+	if err != nil {
+		panic(err)
+	}
+
+	weburl := sourceChannel.Source.String + "?sort=VIEWER_COUNT"
+	if err = wd.Get(weburl); err != nil {
+		panic(err)
+	}
+
+	// The condition reports "not yet" instead of returning the lookup
+	// error, because a condition error aborts the wait immediately.
+	err = wd.WaitWithTimeout(func(wd selenium.WebDriver) (bool, error) {
+		_, err := wd.FindElement(selenium.ByXPATH, "(//div/p[@class=''])[last()]")
+		return err == nil, nil
+	}, time.Second*10)
+	if err != nil {
+		log.Println(err)
+	}
+
+	btn, err := wd.FindElement(selenium.ByXPATH, "//button[@data-a-target='browse-sort-menu']")
+	if err != nil {
+		panic(err)
+	}
+	if err = btn.Click(); err != nil {
+		log.Println(err)
+	}
+
+	// NOTE(review): liveurls is never updated and the loop is capped at two
+	// passes, so delayerror cannot reach zero; as written this presses End
+	// twice with a two second pause each time. Kept as in the original
+	// change, which is marked TODO.
+	var elements []selenium.WebElement
+	var liveurls = 0
+	var delayerror = 3
+	for i := 0; i < 2; i++ {
+		elements, err = wd.FindElements(selenium.ByXPATH, "(//div/p[@class=''])[last()]")
+		if err != nil {
+			panic(err)
+		}
+		if err = wd.KeyDown(selenium.EndKey); err != nil {
+			log.Println(err)
+		}
+		time.Sleep(time.Second * 2)
+		if len(elements) == liveurls {
+			delayerror--
+			if delayerror <= 0 {
+				break
+			}
+		} else {
+			delayerror = 3
+		}
+	}
+	elements, err = wd.FindElements(selenium.ByXPATH, "//article//a[@data-a-target='preview-card-title-link' and @href]")
+	if err != nil {
+		panic(err)
+	}
+
+	for _, e := range elements {
+		attr, err := e.GetAttribute("href")
+		if err != nil {
+			log.Println(err)
+			continue
+		}
+
+		matches := userLinkPattern.FindStringSubmatch(attr)
+		if len(matches) != 2 {
+			log.Println(attr)
+			continue
+		}
+
+		updateUrl := map[string]string{"live": attr}
+		data, err := json.Marshal(updateUrl)
+		if err != nil {
+			log.Println(err)
+			continue
+		}
+
+		streamer := &intimate.Streamer{}
+		streamer.UserId = matches[1]
+		streamer.Platform = intimate.Ptwitch
+		streamer.LiveUrl = sql.NullString{String: attr, Valid: true}
+		streamer.UpdateUrl = data
+		streamer.Operator = 0
+
+		estore.InsertStreamer(streamer)
+	}
+}
diff --git a/tasks/twitch/twitch_task2/task_twitch_test.go b/tasks/twitch/twitch_task2/task_twitch_test.go
new file mode 100644
index 0000000..ef64976
--- /dev/null
+++ b/tasks/twitch/twitch_task2/task_twitch_test.go
@@ -0,0 +1,9 @@
+package main
+
+import "testing"
+
+// TestUserListExecute runs the whole task through main. The name TestMain
+// is avoided because the testing package reserves it for the *testing.M hook.
+func TestUserListExecute(t *testing.T) {
+	main()
+}
diff --git a/utils.go b/utils.go
index 9df8fbb..64f9efe 100644
--- a/utils.go
+++ b/utils.go
@@ -1,8 +1,13 @@
 package intimate
 
 import (
+	"fmt"
 	"log"
+	"runtime"
 	"time"
+
+	"github.com/tebeka/selenium"
+	"github.com/tebeka/selenium/chrome"
 )
 
 var zeroTime time.Time
@@ -42,3 +47,51 @@ func ParseDuration(dt string) (time.Duration, error) {
 	}
 	return tdt.Sub(zeroTime), nil
 }
+
+// GetChromeDriver starts a chromedriver service listening on port and
+// returns a WebDriver session for a Chrome instance connected to it. It
+// panics if the extension cannot be loaded, the service cannot be started
+// or the session cannot be created.
+//
+// Callers should call Quit on the returned driver when they are done. As a
+// best-effort fallback a finalizer quits the session and stops the service
+// once the driver becomes unreachable; finalizers are not guaranteed to run
+// before the program exits.
+func GetChromeDriver(port int) selenium.WebDriver {
+	caps := selenium.Capabilities{"browserName": "chrome"}
+	chromecaps := chrome.Capabilities{}
+	err := chromecaps.AddExtension("/home/eson/test/ssh-key/0.1.2_0.crx")
+	if err != nil {
+		panic(err)
+	}
+	chromecaps.Args = append(chromecaps.Args, "--disk-cache-dir=/tmp/chromedriver-cache")
+	chromecaps.ExcludeSwitches = append(chromecaps.ExcludeSwitches, "enable-automation")
+	caps.AddChrome(chromecaps)
+	service, err := selenium.NewChromeDriverService("/usr/bin/chromedriver", port)
+	if err != nil {
+		panic(err)
+	}
+	wd, err := selenium.NewRemote(caps, fmt.Sprintf("http://localhost:%d/wd/hub", port))
+	if err != nil {
+		// Do not leave the chromedriver process behind when no session
+		// could be created.
+		if stopErr := service.Stop(); stopErr != nil {
+			log.Println(stopErr)
+		}
+		panic(err)
+	}
+	// The finalizer must use its argument: a closure that captured wd would
+	// keep wd reachable, so it would never be finalized.
+	runtime.SetFinalizer(wd, func(obj interface{}) {
+		if err := obj.(selenium.WebDriver).Quit(); err != nil {
+			log.Println(err)
+		}
+		if err := service.Stop(); err != nil {
+			log.Println(err)
+		}
+	})
+	if _, err := wd.ExecuteScript("window.navigator.webdriver = undefined", nil); err != nil {
+		log.Println(err)
+	}
+	return wd
+}