From 826d15876a3334a5b38d893a6b96e7597c8847ef Mon Sep 17 00:00:00 2001 From: eson Date: Tue, 4 Aug 2020 14:12:00 +0800 Subject: [PATCH 1/6] fix windows quit --- tasks/twitch/twitch_task1/task_twitch.go | 5 ++--- tasks/twitch/twitch_task2/task_twitch.go | 2 +- utils.go | 7 +++---- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/tasks/twitch/twitch_task1/task_twitch.go b/tasks/twitch/twitch_task1/task_twitch.go index 2b385ed..d98b28b 100644 --- a/tasks/twitch/twitch_task1/task_twitch.go +++ b/tasks/twitch/twitch_task1/task_twitch.go @@ -51,7 +51,6 @@ func (cl *ChannelLink) Execute() { var hrefs map[string]bool = make(map[string]bool) var delayerror = 5 - var samecount = 0 for i := 0; i <= 200; i++ { cards, err := wd.FindElements(selenium.ByXPATH, "//span/a[contains(@data-a-target,'card-') and @href]") if err != nil { @@ -59,7 +58,7 @@ func (cl *ChannelLink) Execute() { break } - if len(cards) == samecount { + if len(hrefs) == 0 { delayerror-- if delayerror <= 0 { break @@ -80,7 +79,7 @@ func (cl *ChannelLink) Execute() { } break } - samecount = len(cards) + if ps.IsClose() { break } diff --git a/tasks/twitch/twitch_task2/task_twitch.go b/tasks/twitch/twitch_task2/task_twitch.go index 52773de..95899c0 100644 --- a/tasks/twitch/twitch_task2/task_twitch.go +++ b/tasks/twitch/twitch_task2/task_twitch.go @@ -29,12 +29,12 @@ func (cl *UserList) Execute() { //article//a[@data-a-target='preview-card-title-link'] wd := intimate.GetChromeDriver(3030) + defer wd.Quit() ps := intimate.NewPerfectShutdown() counter := intimate.NewCounter() counter.SetMaxLimit(100) counter.SetMaxToDo(func(olist ...interface{}) error { owd := olist[0].(*selenium.WebDriver) - (*owd).Close() (*owd).Quit() *owd = intimate.GetChromeDriver(3030) return nil diff --git a/utils.go b/utils.go index ebee59b..53734ab 100644 --- a/utils.go +++ b/utils.go @@ -103,10 +103,9 @@ func GetChromeDriver(port int) selenium.WebDriver { panic(err) } runtime.SetFinalizer(wd, func(obj interface{}) { - - if err := obj.(selenium.WebDriver).Close(); err != nil { - log.Println(err) - } + // if err := obj.(selenium.WebDriver).Close(); err != nil { + // log.Println(err) + // } if err := obj.(selenium.WebDriver).Quit(); err != nil { log.Println(err) } From 6158976986656b922432364a9e4beba752fe38bc Mon Sep 17 00:00:00 2001 From: eson Date: Tue, 4 Aug 2020 14:13:39 +0800 Subject: [PATCH 2/6] add twitcasting test --- tasks/twitcasting/twitcasting_task1/main.go | 1 + .../twitcasting_task1/main_test.go | 56 +++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 tasks/twitcasting/twitcasting_task1/main.go create mode 100644 tasks/twitcasting/twitcasting_task1/main_test.go diff --git a/tasks/twitcasting/twitcasting_task1/main.go b/tasks/twitcasting/twitcasting_task1/main.go new file mode 100644 index 0000000..06ab7d0 --- /dev/null +++ b/tasks/twitcasting/twitcasting_task1/main.go @@ -0,0 +1 @@ +package main diff --git a/tasks/twitcasting/twitcasting_task1/main_test.go b/tasks/twitcasting/twitcasting_task1/main_test.go new file mode 100644 index 0000000..4003cf7 --- /dev/null +++ b/tasks/twitcasting/twitcasting_task1/main_test.go @@ -0,0 +1,56 @@ +package main + +import ( + "github.com/474420502/focus/compare" + "github.com/474420502/focus/tree/heap" + + "log" + "testing" + + "github.com/474420502/requests" + "github.com/lestrrat-go/libxml2" +) + +func TestMain(t *testing.T) { + + searchurl := "https://twitcasting.tv/rankingindex.php" + queuedict := make(map[string]bool) + queue := heap.New(compare.String) + queue.Put(searchurl) + queuedict[searchurl] = true + + for surl, ok := queue.Pop(); ok; surl, ok = queue.Pop() { + + ses := requests.NewSession() + resp, err := ses.Get(surl.(string)).Execute() + if err != nil { + panic(err) + } + + doc, err := libxml2.ParseHTML(resp.Content()) + if err != nil { + panic(err) + } + defer doc.Free() + result, err := doc.Find("//*[contains(@class, 'tag')]/@href") + if err != nil { + panic(err) + } + defer result.Free() + + iter := result.NodeIter() + for iter.Next() { + + log.Println(iter.Node().NodeValue()) + wurl := "https://twitcasting.tv" + iter.Node().NodeValue() + if ok := queuedict[wurl]; !ok { + queue.Put(wurl) + queuedict[wurl] = true + } + } + + doc.Find("//div[@class='tw-search-result-row']") + + log.Println("finish remain", queue.Size()) + } +} From 0bff7169ec6ba9f690f6d4ae008b449ab54cf9a4 Mon Sep 17 00:00:00 2001 From: eson Date: Wed, 5 Aug 2020 18:49:47 +0800 Subject: [PATCH 3/6] Extractor upgrade --- extractor_field.go | 29 ++++++++ go.mod | 1 + go.sum | 8 +++ platform_list.go | 3 + sql/intimate_extractor.sql | 19 +++++ store.go | 48 +++++++++++++ table_list.go | 3 + .../twitcasting_task1/main_test.go | 72 ++++++++++++++++--- tasks/twitch/twitch_task2/task_twitch.go | 5 +- utils.go | 7 +- 10 files changed, 180 insertions(+), 15 deletions(-) diff --git a/extractor_field.go b/extractor_field.go index ad5ef30..c091c5b 100644 --- a/extractor_field.go +++ b/extractor_field.go @@ -3,6 +3,7 @@ package intimate import ( "database/sql" "reflect" + "time" "github.com/474420502/hunter" "github.com/tidwall/gjson" @@ -11,6 +12,34 @@ import ( type GetSet struct { } +type StreamerList struct { + UrlHash []byte // + Platform Platform // + Url string // + + Label sql.NullString // + + Serialize interface{} + + UpdateInterval int32 + UpdateTime time.Time // + + ErrorMsg sql.NullString + Operator int32 + + LastOperator int32 +} + +// Get Simple Value +func (sl *StreamerList) Get(field string) interface{} { + return reflect.ValueOf(sl).Elem().FieldByName(field).Interface() +} + +// Set Simple Value +func (sl *StreamerList) Set(field string, value interface{}) { + reflect.ValueOf(sl).Elem().FieldByName(field).Set(reflect.ValueOf(value)) +} + type Streamer struct { Uid int64 // Platform Platform // diff --git a/go.mod b/go.mod index d8b824d..0841fb6 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,7 @@ module intimate go 1.14 require ( + github.com/474420502/extractor v0.2.2 github.com/474420502/focus v0.12.0 github.com/474420502/gcurl v0.1.2 github.com/474420502/hunter v0.3.4 diff --git a/go.sum b/go.sum index a556617..61ec03c 100644 --- a/go.sum +++ b/go.sum @@ -2,12 +2,16 @@ cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMT cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= cloud.google.com/go v0.38.0/go.mod h1:990N+gfupTy94rShfmMCWGDn0LpTmnzTp2qbd1dvSRU= cloud.google.com/go v0.41.0/go.mod h1:OauMR7DV8fzvZIl2qg6rkaIhD/vmgk4iwEw/h6ercmg= +github.com/474420502/extractor v0.2.2 h1:hGao2iZt5CEI8oqYjQW938osQdHKgNWL/bwRJQNgHTM= +github.com/474420502/extractor v0.2.2/go.mod h1:OVFijdKLDghigpIYISHzlognL5q8eeVenT2fRhCyFns= github.com/474420502/focus v0.12.0 h1:+icbmj7IEOefvTegHt5EpcHt6WFbe2miIrceUJx2Evo= github.com/474420502/focus v0.12.0/go.mod h1:d0PMjtMxFz1a9HIhwyFPkWa+JF+0LgOrEUfd8iZka6s= github.com/474420502/gcurl v0.1.2 h1:ON9Yz3IgAdtDlFlHfkAJ3aIEBDxH0RiViPE5ST5ohKg= github.com/474420502/gcurl v0.1.2/go.mod h1:hws5q/Ao64bXLLDnldz9VyTQUndTWc/i5DzdEazFfoM= github.com/474420502/hunter v0.3.4 h1:fyLAgI84jWe3IcqsISC53j1w3CXI1FERxX//Potns0M= github.com/474420502/hunter v0.3.4/go.mod h1:pe4Xr/I+2agvq339vS/OZV+EiHAWtpXQs75rioSW9oA= +github.com/474420502/libxml2 v0.0.0-20200803084225-29e441d26406 h1:nLvl2D2y+hxCglLnRmLqwRGwmUsXQt8ga46zGySTU1I= +github.com/474420502/libxml2 v0.0.0-20200803084225-29e441d26406/go.mod h1:bUbcte7hFuLijGG6/+gGxurW3XvxE/CBdfAAlsIWj34= github.com/474420502/requests v1.6.0 h1:f4h4j40eT0P5whhg9LdkotD8CaKjtuDu/vz9iSUkCgY= github.com/474420502/requests v1.6.0/go.mod h1:SLXrQ5dL9c7dkIeKNUCBAjOIt3J9KFCS2RQjWJecNwo= github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ= @@ -66,6 +70,8 @@ github.com/rogpeppe/go-charset v0.0.0-20180617210344-2471d30d28b4/go.mod h1:qgYe github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/stretchr/testify v1.6.1 h1:hDPOHmpOpP40lSULcqw7IrRb/u7w6RpDC9399XyoNd0= +github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/tebeka/selenium v0.9.9 h1:cNziB+etNgyH/7KlNI7RMC1ua5aH1+5wUlFQyzeMh+w= github.com/tebeka/selenium v0.9.9/go.mod h1:5Fr8+pUvU6B1OiPfkdCKdXZyr5znvVkxuPd0NOdZCQc= github.com/tidwall/gjson v1.3.2/go.mod h1:P256ACg0Mn+j1RXIDXoss50DeIABTYK1PULOJHhxOls= @@ -162,6 +168,8 @@ gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.3.0 h1:clyUAQHOM3G0M3f5vQj7LuJrETvjVot3Z5el9nffUtU= gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a h1:LJwr7TCTghdatWv40WobzlKXc9c4s8oGa7QKJUtHhWA= diff --git a/platform_list.go b/platform_list.go index 0bdefc0..246e991 100644 --- a/platform_list.go +++ b/platform_list.go @@ -9,4 +9,7 @@ const ( // Ptwitch twitch 平台 Ptwitch Platform = "twitch" + + // Ptwitcasting twitcasting 平台 + Ptwitcasting Platform = "twitcasting" ) diff --git a/sql/intimate_extractor.sql b/sql/intimate_extractor.sql index 91d3e3b..e8ccc1c 100644 --- a/sql/intimate_extractor.sql +++ b/sql/intimate_extractor.sql @@ -1,6 +1,25 @@ create database if not exists `intimate_extractor`; use intimate_extractor; +CREATE TABLE IF NOT EXISTS `streamer_list` ( + `urlhash` varchar(32) NOT NULL COMMENT '平台', + `url` text COMMENT 'url获取streamer列表的url', + `platform` varchar(255) NOT NULL COMMENT '平台', + `label` varchar(255) DEFAULT NULL COMMENT '必须的时候打上标签', + `serialize` blob DEFAULT NULL COMMENT '保存进程的必要计算数据', + + `update_interval` int DEFAULT 120 COMMENT '分钟单位, 默认120分钟, 下次更新的时间间隔', + `update_time` Timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间', + + `error_msg` text DEFAULT NULL COMMENT '错误信息', + `operator` int DEFAULT 0 COMMENT '操作标志位, 根据不同解析方法有不同标志', + + PRIMARY KEY (`urlhash`), + KEY `platform_idx` (`platform`), + KEY `update_time_idx` (`update_time`), + KEY `operator_idx` (`operator`) +) + CREATE TABLE IF NOT EXISTS `streamer` ( `uid` bigint AUTO_INCREMENT COMMENT '自增UID, 便于查询定位', `platform` varchar(255) NOT NULL COMMENT '平台', diff --git a/store.go b/store.go index cbdbbfc..0f34036 100644 --- a/store.go +++ b/store.go @@ -1,8 +1,11 @@ package intimate import ( + "crypto/md5" "database/sql" + "fmt" "log" + "strings" "time" _ "github.com/go-sql-driver/mysql" @@ -182,6 +185,9 @@ const StreamerTable string = "streamer" // CollectLogTable 采集日志表 const CollectLogTable string = "collect_log" +// StreamerListTable 主播表名称 +const StreamerListTable string = "streamer_list" + type StoreExtractor struct { db *sql.DB @@ -267,6 +273,48 @@ func (store *StoreExtractor) Pop(platform Platform, operators ...int32) (*Stream return s, nil } +// UpdateStreamerList streamerlist表, 更新数据 +func (store *StoreExtractor) UpdateStreamerList(streamer IGet, fieldvalues ...interface{}) { + updateSQL := "UPDATE " + StreamerListTable + " SET " + var values []interface{} + for i := 0; i < len(fieldvalues); i += 2 { + field := fieldvalues[i] + values = append(values, fieldvalues[i+1]) + updateSQL += field.(string) + " = ? " + } + updateSQL += "WHERE urlhash = ?" + values = append(values, streamer.Get("UrlHash")) + _, err := store.db.Exec(updateSQL, values...) + if err != nil { + panic(err) + } +} + +// InsertStreamer streamerlist表, 插入数据 +func (store *StoreExtractor) InsertStreamerList(streamerlist IGet) (isExists bool) { + urlstr := streamerlist.Get("Url").(string) + + _, err := store.db.Exec("insert into streamer_list(urlhash, url, platform, label, serialize, update_interval, error_msg, operator) values(?,?,?,?,?,?,?,?)", + fmt.Sprintf("%x", md5.Sum([]byte(urlstr))), + urlstr, + streamerlist.Get("Platform"), + streamerlist.Get("Label"), + streamerlist.Get("Serialize"), + streamerlist.Get("UpdateInterval"), + streamerlist.Get("ErrorMsg"), + streamerlist.Get("Operator"), + ) + + if err != nil { + if !strings.HasPrefix(err.Error(), "Error 1062") { + log.Println(err) + } + return true + } + + return false +} + // InsertStreamer Streamer表, 插入数据 func (store *StoreExtractor) InsertStreamer(streamer IGet) (isExists bool) { // select uid from table where platform = ? and user_id = ? diff --git a/table_list.go b/table_list.go index 04002dc..cb12c70 100644 --- a/table_list.go +++ b/table_list.go @@ -9,4 +9,7 @@ const ( // STTwitch twitch源table名称 STTwitch SourceTable = "source_twitch" + + // STTwitcasting STTwitcasting源table名称 + STTwitcasting SourceTable = "source_twitcasting" ) diff --git a/tasks/twitcasting/twitcasting_task1/main_test.go b/tasks/twitcasting/twitcasting_task1/main_test.go index 4003cf7..2408667 100644 --- a/tasks/twitcasting/twitcasting_task1/main_test.go +++ b/tasks/twitcasting/twitcasting_task1/main_test.go @@ -1,6 +1,10 @@ package main import ( + "intimate" + "time" + + "github.com/474420502/extractor" "github.com/474420502/focus/compare" "github.com/474420502/focus/tree/heap" @@ -8,9 +12,14 @@ import ( "testing" "github.com/474420502/requests" - "github.com/lestrrat-go/libxml2" ) +// sstore 源存储实例, 为存储源数据的实现. 表格具体参考sql/intimate_source.sql +var sstore *intimate.StoreSource = intimate.NewStoreSource(string(intimate.STTwitcasting)) + +// estore 解析存储连接实例 +var estore *intimate.StoreExtractor = intimate.NewStoreExtractor() + func TestMain(t *testing.T) { searchurl := "https://twitcasting.tv/rankingindex.php" @@ -27,30 +36,73 @@ func TestMain(t *testing.T) { panic(err) } - doc, err := libxml2.ParseHTML(resp.Content()) + etor := extractor.ExtractXml(resp.Content()) + + // doc, err := libxml2.ParseHTML(resp.Content()) + // if err != nil { + // panic(err) + // } + // defer doc.Free() + + result, err := etor.XPath("//*[contains(@class, 'tag')]/@href") if err != nil { panic(err) } - defer doc.Free() - result, err := doc.Find("//*[contains(@class, 'tag')]/@href") - if err != nil { - panic(err) - } - defer result.Free() + + // result, err := doc.Find("//*[contains(@class, 'tag')]/@href") + // if err != nil { + // panic(err) + // } + // defer result.Free() iter := result.NodeIter() for iter.Next() { - log.Println(iter.Node().NodeValue()) wurl := "https://twitcasting.tv" + iter.Node().NodeValue() if ok := queuedict[wurl]; !ok { + log.Println(wurl) + sl := &intimate.StreamerList{} + sl.Platform = intimate.Ptwitcasting + sl.Url = wurl + sl.Operator = 0 + sl.UpdateInterval = 120 + sl.UpdateTime = time.Now() + estore.InsertStreamerList(sl) queue.Put(wurl) queuedict[wurl] = true } } - doc.Find("//div[@class='tw-search-result-row']") + // doc.Find("//div[@class='tw-search-result-row']") + xps, err := etor.XPaths("//div[@class='tw-search-result-row']") + if err != nil { + log.Println(surl, err) + continue + } + + // xps.ForEachTag(SearchProfile{}) + + // texts, errs := xps.ForEachText(".//span[@class='username']") + // if len(errs) > 0 { + // t.Error(errs) + // } + var splist = xps.ForEachTag(SearchProfile{}) + for _, isp := range splist { + sp := isp.(*SearchProfile) + sp.UserId = sp.LiveUrl[1:] + // log.Println(sp.(SearchProfile)) + } + + for _, isp := range splist { + log.Println(isp.(*SearchProfile)) + } log.Println("finish remain", queue.Size()) } } + +type SearchProfile struct { + UserName string `exp:".//span[@class='username']" method:"Text"` + UserId string // `exp:".//span[@class='fullname']" method:"Text"` + LiveUrl string `exp:".//div[@class='usertext']/a[@href]" method:"Attribute,href Value"` +} diff --git a/tasks/twitch/twitch_task2/task_twitch.go b/tasks/twitch/twitch_task2/task_twitch.go index 95899c0..915a969 100644 --- a/tasks/twitch/twitch_task2/task_twitch.go +++ b/tasks/twitch/twitch_task2/task_twitch.go @@ -29,13 +29,16 @@ func (cl *UserList) Execute() { //article//a[@data-a-target='preview-card-title-link'] wd := intimate.GetChromeDriver(3030) + defer wd.Close() defer wd.Quit() ps := intimate.NewPerfectShutdown() counter := intimate.NewCounter() counter.SetMaxLimit(100) counter.SetMaxToDo(func(olist ...interface{}) error { owd := olist[0].(*selenium.WebDriver) - (*owd).Quit() + if err := (*owd).Quit(); err != nil { + log.Println(err) + } *owd = intimate.GetChromeDriver(3030) return nil }, &wd) diff --git a/utils.go b/utils.go index 53734ab..bad4a08 100644 --- a/utils.go +++ b/utils.go @@ -103,13 +103,12 @@ func GetChromeDriver(port int) selenium.WebDriver { panic(err) } runtime.SetFinalizer(wd, func(obj interface{}) { - // if err := obj.(selenium.WebDriver).Close(); err != nil { - // log.Println(err) - // } + if err := obj.(selenium.WebDriver).Close(); err != nil { + log.Println(err) + } if err := obj.(selenium.WebDriver).Quit(); err != nil { log.Println(err) } - }) wd.ExecuteScript("windows.navigator.webdriver = undefined", nil) if err != nil { From 23fa32b4aed7de40df4c3c1f2e7ddac97a480d85 Mon Sep 17 00:00:00 2001 From: eson Date: Fri, 7 Aug 2020 18:10:22 +0800 Subject: [PATCH 4/6] =?UTF-8?q?TODO:=20=E9=87=8D=E6=9E=84XPath=20=E7=9A=84?= =?UTF-8?q?=E4=BD=BF=E7=94=A8=20shit?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config.yaml | 4 +- go.mod | 3 +- go.sum | 41 ++++- store.go | 2 +- .../twitcasting_task1/main_test.go | 140 ++++++++++++++---- 5 files changed, 153 insertions(+), 37 deletions(-) diff --git a/config.yaml b/config.yaml index 1e031f8..bf7f89d 100644 --- a/config.yaml +++ b/config.yaml @@ -1,3 +1,3 @@ database: - source_uri: "root:@tcp(127.0.0.1:4000)/intimate_source?parseTime=true&loc=Local" - extractor_uri: "root:@tcp(127.0.0.1:4000)/intimate_extractor?parseTime=true&loc=Local" \ No newline at end of file + source_uri: "root:@tcp(127.0.0.1:4000)/intimate_source?parseTime=true&loc=Local&charset=utf8mb4&collation=utf8mb4_unicode_ci" + extractor_uri: "root:@tcp(127.0.0.1:4000)/intimate_extractor?parseTime=true&loc=Local&charset=utf8mb4&collation=utf8mb4_unicode_ci" \ No newline at end of file diff --git a/go.mod b/go.mod index 0841fb6..406d3f9 100644 --- a/go.mod +++ b/go.mod @@ -3,13 +3,14 @@ module intimate go 1.14 require ( - github.com/474420502/extractor v0.2.2 + github.com/474420502/extractor v0.4.1 github.com/474420502/focus v0.12.0 github.com/474420502/gcurl v0.1.2 github.com/474420502/hunter v0.3.4 github.com/474420502/requests v1.6.0 github.com/go-sql-driver/mysql v1.5.0 github.com/lestrrat-go/libxml2 v0.0.0-20200215080510-6483566f52cb + github.com/stretchr/testify v1.6.1 // indirect github.com/tebeka/selenium v0.9.9 github.com/tidwall/gjson v1.6.0 github.com/tidwall/pretty v1.0.1 // indirect diff --git a/go.sum b/go.sum index 61ec03c..119a3b6 100644 --- a/go.sum +++ b/go.sum @@ -2,16 +2,44 @@ cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMT cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= cloud.google.com/go v0.38.0/go.mod h1:990N+gfupTy94rShfmMCWGDn0LpTmnzTp2qbd1dvSRU= cloud.google.com/go v0.41.0/go.mod h1:OauMR7DV8fzvZIl2qg6rkaIhD/vmgk4iwEw/h6ercmg= -github.com/474420502/extractor v0.2.2 h1:hGao2iZt5CEI8oqYjQW938osQdHKgNWL/bwRJQNgHTM= -github.com/474420502/extractor v0.2.2/go.mod h1:OVFijdKLDghigpIYISHzlognL5q8eeVenT2fRhCyFns= +github.com/474420502/extractor v0.3.0 h1:VURhjNFP2kG6DvPZfsRR/3JLYHURvsHazp/JazNYbME= +github.com/474420502/extractor v0.3.0/go.mod h1:thq0UAm30cMLY6+LJHPNRSw/H3ZrMGfmK0rk+HwycvE= +github.com/474420502/extractor v0.3.1 h1:IxOeJziOR3DPrZJhOcbOUzAc/UABmKUYGLdVgxSi9yk= +github.com/474420502/extractor v0.3.1/go.mod h1:thq0UAm30cMLY6+LJHPNRSw/H3ZrMGfmK0rk+HwycvE= +github.com/474420502/extractor v0.3.2 h1:KcgRC0+pNfK803uZjL76pgsfsnlKSMR1nQX6o6y8cVA= +github.com/474420502/extractor v0.3.2/go.mod h1:yQRtpUOeb37tMitCsenURnN2Yas9Jm/5HGFDCO+/20k= +github.com/474420502/extractor v0.3.3 h1:2/rCOEtTVkezGqz7E0D8KKN1QBKlQaihe+UMxNZcwNk= +github.com/474420502/extractor v0.3.3/go.mod h1:8cakB/mW3No6o2I7PtrVHQ35auIgHh0mGIfk1++UZm4= +github.com/474420502/extractor v0.3.4 h1:3lKV5oke46sDAxkiY4KGMeBiYI8hwNkiAa2Sf8B+xPY= +github.com/474420502/extractor v0.3.4/go.mod h1:+biDin5eKLuJQHNbW+HnPYCC+2LL090iCZNxQklB11Y= +github.com/474420502/extractor v0.3.5 h1:uq3SuPY51F1pYvAtnaJtcqtJ+yE7wcaq3LP9DWTtBnQ= +github.com/474420502/extractor v0.3.5/go.mod h1:pKjqYQCZquakvor/d9JJQYrTYInWKaVXjzAg+IM1/tY= +github.com/474420502/extractor v0.3.6 h1:Qsky2YYUCENz3BFzlFOOWykFyDOfigbkkCTnMAkKExE= +github.com/474420502/extractor v0.3.6/go.mod h1:rH+/kx0CS8xpzOBqraisQE1A9vfXAPZZ+091D8HYXvw= +github.com/474420502/extractor v0.3.7 h1:QDBd4mAjf6D+vH98LQ1SJByDTtLago9GDiEvN1oyDJ0= +github.com/474420502/extractor v0.3.7/go.mod h1:v0TAfUw1zNyFCYVqj5xyFVFpoqmqErvAd2SzMzR/yc8= +github.com/474420502/extractor v0.4.0 h1:h6MbrkCBPQ2/+VRAK741oVcZuDhZ2t4USt0MOIf/v2U= +github.com/474420502/extractor v0.4.0/go.mod h1:1oPuXIm7whY+/rU7hxDW3ick4hHc4AdiNqdk5vVWaXs= +github.com/474420502/extractor v0.4.1 h1:WqcwF7gyvGREBrXBAm3fLR7yqxP/P/arq/iHXZvt8Gg= +github.com/474420502/extractor v0.4.1/go.mod h1:1oPuXIm7whY+/rU7hxDW3ick4hHc4AdiNqdk5vVWaXs= github.com/474420502/focus v0.12.0 h1:+icbmj7IEOefvTegHt5EpcHt6WFbe2miIrceUJx2Evo= github.com/474420502/focus v0.12.0/go.mod h1:d0PMjtMxFz1a9HIhwyFPkWa+JF+0LgOrEUfd8iZka6s= github.com/474420502/gcurl v0.1.2 h1:ON9Yz3IgAdtDlFlHfkAJ3aIEBDxH0RiViPE5ST5ohKg= github.com/474420502/gcurl v0.1.2/go.mod h1:hws5q/Ao64bXLLDnldz9VyTQUndTWc/i5DzdEazFfoM= github.com/474420502/hunter v0.3.4 h1:fyLAgI84jWe3IcqsISC53j1w3CXI1FERxX//Potns0M= github.com/474420502/hunter v0.3.4/go.mod h1:pe4Xr/I+2agvq339vS/OZV+EiHAWtpXQs75rioSW9oA= -github.com/474420502/libxml2 v0.0.0-20200803084225-29e441d26406 h1:nLvl2D2y+hxCglLnRmLqwRGwmUsXQt8ga46zGySTU1I= -github.com/474420502/libxml2 v0.0.0-20200803084225-29e441d26406/go.mod h1:bUbcte7hFuLijGG6/+gGxurW3XvxE/CBdfAAlsIWj34= +github.com/474420502/libxml2 v0.0.0-20200806111302-aa4be92ad592 h1:kgvx2MvoMhkrzLVjM6C6RIcshgI80fnq5/LqAnTOMxQ= +github.com/474420502/libxml2 v0.0.0-20200806111302-aa4be92ad592/go.mod h1:bUbcte7hFuLijGG6/+gGxurW3XvxE/CBdfAAlsIWj34= +github.com/474420502/libxml2 v0.0.0-20200807033034-1b43ad443d1d h1:MQduBAgnOCeGVUU+tawJxQLP1/Bgnn7119hGpVb9VFI= +github.com/474420502/libxml2 v0.0.0-20200807033034-1b43ad443d1d/go.mod h1:bUbcte7hFuLijGG6/+gGxurW3XvxE/CBdfAAlsIWj34= +github.com/474420502/libxml2 v0.0.0-20200807033649-9731e0a44bf0 h1:EiO+pSoFk7TTv/TnVFCT/swjWQEeLAZ2wXeXsS+9+kY= +github.com/474420502/libxml2 v0.0.0-20200807033649-9731e0a44bf0/go.mod h1:bUbcte7hFuLijGG6/+gGxurW3XvxE/CBdfAAlsIWj34= +github.com/474420502/libxml2 v0.0.0-20200807034854-eaa2a69a2790 h1:vzHGXv0e7MX+MSZcz4SjRJUfzoUpX96Qf0f48T6dkxk= +github.com/474420502/libxml2 v0.0.0-20200807034854-eaa2a69a2790/go.mod h1:bUbcte7hFuLijGG6/+gGxurW3XvxE/CBdfAAlsIWj34= +github.com/474420502/libxml2 v0.0.0-20200807035356-cd2e51185f4b h1:q9qSCx9gm7gS6Xr2nmKqkiu2FApQJFkqvTsrAzcWXps= +github.com/474420502/libxml2 v0.0.0-20200807035356-cd2e51185f4b/go.mod h1:bUbcte7hFuLijGG6/+gGxurW3XvxE/CBdfAAlsIWj34= +github.com/474420502/libxml2 v0.0.0-20200807040518-4ef6186ae68c h1:UZriMoPoXEA4Mq/yP+36sxwkOC3Jk3nqy2I7e3ZV470= +github.com/474420502/libxml2 v0.0.0-20200807040518-4ef6186ae68c/go.mod h1:bUbcte7hFuLijGG6/+gGxurW3XvxE/CBdfAAlsIWj34= github.com/474420502/requests v1.6.0 h1:f4h4j40eT0P5whhg9LdkotD8CaKjtuDu/vz9iSUkCgY= github.com/474420502/requests v1.6.0/go.mod h1:SLXrQ5dL9c7dkIeKNUCBAjOIt3J9KFCS2RQjWJecNwo= github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ= @@ -24,6 +52,9 @@ github.com/Pallinder/go-randomdata v1.1.0 h1:gUubB1IEUliFmzjqjhf+bgkg1o6uoFIkRsP github.com/Pallinder/go-randomdata v1.1.0/go.mod h1:yHmJgulpD2Nfrm0cR9tI/+oAgRqCQQixsA8HyRZfV9Y= github.com/Pallinder/go-randomdata v1.2.0 h1:DZ41wBchNRb/0GfsePLiSwb0PHZmT67XY00lCDlaYPg= github.com/Pallinder/go-randomdata v1.2.0/go.mod h1:yHmJgulpD2Nfrm0cR9tI/+oAgRqCQQixsA8HyRZfV9Y= +github.com/antchfx/htmlquery v1.2.3/go.mod h1:B0ABL+F5irhhMWg54ymEZinzMSi0Kt3I2if0BLYa3V0= +github.com/antchfx/xpath v1.1.6/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= +github.com/antchfx/xpath v1.1.10/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= github.com/blang/semver v3.5.1+incompatible h1:cQNTCjp13qL8KC3Nbxr/y2Bqb63oX6wdnnjpJbkM4JQ= @@ -40,6 +71,7 @@ github.com/go-sql-driver/mysql v1.5.0 h1:ozyZYNQW3x3HtqT1jira07DN2PArx2v7/mN66gG github.com/go-sql-driver/mysql v1.5.0/go.mod h1:DCzpHaOWr8IXmIStZouvnhqoel9Qv2LBy8hT2VhHyBg= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b h1:VKtxabqXZkF25pY9ekfRL6a582T4P37/31XEstQ5p58= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= +github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/mock v1.3.1/go.mod h1:sBzyDLLjw3U8JLTeZvSv8jJB+tU5PVekmnlKIyFUx0Y= @@ -109,6 +141,7 @@ golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e h1:3G+cUijn7XD+S4eJFddp53Pv7+slrESplyjG25HgL+k= golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20200707034311-ab3426394381 h1:VXak5I6aEWmAXeQjA+QSZzlgNrpq9mjcfDemuexIKsU= golang.org/x/net v0.0.0-20200707034311-ab3426394381/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= diff --git a/store.go b/store.go index 0f34036..e83718f 100644 --- a/store.go +++ b/store.go @@ -346,7 +346,7 @@ func (store *StoreExtractor) InsertStreamer(streamer IGet) (isExists bool) { return true } - _, err = tx.Exec("INSERT INTO "+StreamerTable+"(platform, user_id, update_url, update_time) VALUES(?,?,?,?);", streamer.Get("Platform"), streamer.Get("UserId"), streamer.Get("UpdateUrl"), time.Now().Add(-time.Minute*60)) + _, err = tx.Exec("INSERT INTO "+StreamerTable+"(platform, user_id, update_url, tags, update_time) VALUES(?,?,?,?,?);", streamer.Get("Platform"), streamer.Get("UserId"), streamer.Get("UpdateUrl"), streamer.Get("Tags"), time.Now().Add(-time.Minute*60)) if err != nil { panic(err) } diff --git a/tasks/twitcasting/twitcasting_task1/main_test.go b/tasks/twitcasting/twitcasting_task1/main_test.go index 2408667..7acc76b 100644 --- a/tasks/twitcasting/twitcasting_task1/main_test.go +++ b/tasks/twitcasting/twitcasting_task1/main_test.go @@ -1,7 +1,14 @@ package main import ( + "database/sql" + "encoding/json" "intimate" + "net/http" + "net/url" + "os" + "os/signal" + "syscall" "time" "github.com/474420502/extractor" @@ -11,9 +18,18 @@ import ( "log" "testing" + _ "net/http/pprof" + "github.com/474420502/requests" ) +func Test(t *testing.T) { + rawurl := "https://twitcasting.tv/你好" + u, _ := url.Parse(rawurl) + t.Error(u.EscapedPath()) + t.Error(u.String()) +} + // sstore 源存储实例, 为存储源数据的实现. 表格具体参考sql/intimate_source.sql var sstore *intimate.StoreSource = intimate.NewStoreSource(string(intimate.STTwitcasting)) @@ -21,44 +37,74 @@ var sstore *intimate.StoreSource = intimate.NewStoreSource(string(intimate.STTwi var estore *intimate.StoreExtractor = intimate.NewStoreExtractor() func TestMain(t *testing.T) { + f, _ := os.OpenFile("./log", os.O_TRUNC|os.O_CREATE|os.O_RDWR, os.ModePerm) + log.SetFlags(log.Llongfile | log.Ltime) + log.SetOutput(f) + go func() { + log.Println(http.ListenAndServe(":4040", nil)) + }() + + homeurl := "https://twitcasting.tv" searchurl := "https://twitcasting.tv/rankingindex.php" queuedict := make(map[string]bool) queue := heap.New(compare.String) queue.Put(searchurl) queuedict[searchurl] = true + ses := requests.NewSession() + ses.Config().SetTimeout(15) - for surl, ok := queue.Pop(); ok; surl, ok = queue.Pop() { + var surl interface{} + var ok bool + var debugsp *SearchProfile + var content []byte - ses := requests.NewSession() - resp, err := ses.Get(surl.(string)).Execute() + defer func() { + if ierr := recover(); ierr != nil { + log.Println(surl, debugsp) + f, _ := os.OpenFile("./error.html", os.O_TRUNC|os.O_CREATE|os.O_RDWR, os.ModePerm) + f.Write(content) + f.Close() + log.Panic(ierr) + } + }() + + go func() { + signalchan := make(chan os.Signal) + signal.Notify(signalchan, syscall.SIGINT, syscall.SIGKILL, syscall.SIGQUIT, syscall.SIGTERM, syscall.SIGSTOP) + log.Println("accept stop command:", <-signalchan) + f, _ := os.OpenFile("./error.html", os.O_TRUNC|os.O_CREATE|os.O_RDWR, os.ModePerm) + f.Write(content) + f.Close() + os.Exit(1) + }() + + for surl, ok = queue.Pop(); ok; surl, ok = queue.Pop() { + u, err := url.Parse(surl.(string)) if err != nil { - panic(err) + log.Println(err) + continue } + resp, err := ses.Get(u.String()).Execute() + if err != nil { + log.Println(err) + log.Println(u.String(), surl) + continue + // log.Panic(err) + } + + content = resp.Content() etor := extractor.ExtractXml(resp.Content()) - - // doc, err := libxml2.ParseHTML(resp.Content()) - // if err != nil { - // panic(err) - // } - // defer doc.Free() - - result, err := etor.XPath("//*[contains(@class, 'tag')]/@href") + result, err := etor.XPath("//p[@class='taglist']/a[contains(@class, 'tag')]/@href") if err != nil { panic(err) } - // result, err := doc.Find("//*[contains(@class, 'tag')]/@href") - // if err != nil { - // panic(err) - // } - // defer result.Free() - iter := result.NodeIter() for iter.Next() { - wurl := "https://twitcasting.tv" + iter.Node().NodeValue() + wurl := homeurl + iter.Node().NodeValue() if ok := queuedict[wurl]; !ok { log.Println(wurl) sl := &intimate.StreamerList{} @@ -67,7 +113,9 @@ func TestMain(t *testing.T) { sl.Operator = 0 sl.UpdateInterval = 120 sl.UpdateTime = time.Now() + estore.InsertStreamerList(sl) + queue.Put(wurl) queuedict[wurl] = true } @@ -80,21 +128,53 @@ func TestMain(t *testing.T) { continue } - // xps.ForEachTag(SearchProfile{}) - - // texts, errs := xps.ForEachText(".//span[@class='username']") - // if len(errs) > 0 { - // t.Error(errs) - // } + log.Println("extract tag") var splist = xps.ForEachTag(SearchProfile{}) + log.Println("finish extract tag") for _, isp := range splist { sp := isp.(*SearchProfile) + if sp.LiveUrl == "" { + continue + } + sp.UserId = sp.LiveUrl[1:] + for i := 0; i < len(sp.TagUrl); i++ { + wurl := homeurl + sp.TagUrl[i] + sp.TagUrl[i] = wurl + if ok := queuedict[wurl]; !ok { + sl := &intimate.StreamerList{} + sl.Platform = intimate.Ptwitcasting + sl.Url = wurl + sl.Operator = 0 + sl.UpdateInterval = 120 + sl.UpdateTime = time.Now() + estore.InsertStreamerList(sl) + + queue.Put(wurl) + queuedict[wurl] = true + } + } // log.Println(sp.(SearchProfile)) } + log.Println("find user:", len(splist)) for _, isp := range splist { - log.Println(isp.(*SearchProfile)) + sp := isp.(*SearchProfile) + // log.Println(sp) + streamer := &intimate.Streamer{} + streamer.Platform = intimate.Ptwitcasting + streamer.LiveUrl = sql.NullString{String: sp.LiveUrl, Valid: true} + if btags, err := json.Marshal(sp.Tag); err != nil { + log.Println(err) + } else { + streamer.Tags = btags + } + streamer.UpdateInterval = 120 + streamer.UpdateTime = sql.NullTime{Time: time.Now(), Valid: true} + streamer.UserName = sql.NullString{String: sp.UserName, Valid: true} + streamer.UserId = sp.UserId + debugsp = sp + estore.InsertStreamer(streamer) } log.Println("finish remain", queue.Size()) @@ -102,7 +182,9 @@ func TestMain(t *testing.T) { } type SearchProfile struct { - UserName string `exp:".//span[@class='username']" method:"Text"` - UserId string // `exp:".//span[@class='fullname']" method:"Text"` - LiveUrl string `exp:".//div[@class='usertext']/a[@href]" method:"Attribute,href Value"` + UserName string `exp:".//span[@class='username']" method:"Text"` + UserId string // `exp:".//span[@class='fullname']" method:"Text"` + LiveUrl string `exp:".//div[@class='usertext']/a[@href]" method:"Attribute,href Value"` + Tag []string `exp:".//a[contains(@class, 'tag tag-mini')]" method:"Text"` + TagUrl []string `exp:".//a[contains(@class, 'tag tag-mini')]" method:"Attribute,href Value"` } From b82b4f5b774f5f24fa2ce25d941268c0acb3845b Mon Sep 17 00:00:00 2001 From: eson Date: Tue, 11 Aug 2020 18:26:17 +0800 Subject: [PATCH 5/6] finish tag slice --- .../openrec_extractor/openrec_extractor.go | 2 +- .../twitcasting_extractor.go | 140 +++ .../twitcasting_extractor_test.go | 12 + .../twitch_extractor/tiwtch_extractor.go | 11 +- extractor_field.go | 2 +- go.mod | 3 +- go.sum | 42 +- store.go | 2 +- .../twitcasting/twitcasting_task1/.gitignore | 2 + .../twitcasting/twitcasting_task1/error.html | 991 ++++++++++++++++++ tasks/twitcasting/twitcasting_task1/main.go | 4 + .../twitcasting_task1/main_test.go | 177 +--- .../twitcasting_task1/twitcasting.go | 140 +++ 13 files changed, 1317 insertions(+), 211 deletions(-) create mode 100644 extractor/twitcasting_extractor/twitcasting_extractor.go create mode 100644 extractor/twitcasting_extractor/twitcasting_extractor_test.go create mode 100644 tasks/twitcasting/twitcasting_task1/.gitignore create mode 100755 tasks/twitcasting/twitcasting_task1/error.html create mode 100644 tasks/twitcasting/twitcasting_task1/twitcasting.go diff --git a/extractor/openrec_extractor/openrec_extractor.go b/extractor/openrec_extractor/openrec_extractor.go index 3b689ad..41cf76d 100644 --- a/extractor/openrec_extractor/openrec_extractor.go +++ b/extractor/openrec_extractor/openrec_extractor.go @@ -87,7 +87,7 @@ func (oe *OpenrecExtractor) Execute() { streamer.UpdateTime = source.UpdateTime streamer.Tags = clog.Tags - clog.Platform = string(intimate.Popenrec) + clog.Platform = intimate.Popenrec clog.UserId = userId clog.UpdateTime = source.UpdateTime diff --git a/extractor/twitcasting_extractor/twitcasting_extractor.go b/extractor/twitcasting_extractor/twitcasting_extractor.go new file mode 100644 index 0000000..44e059f --- /dev/null +++ b/extractor/twitcasting_extractor/twitcasting_extractor.go @@ -0,0 +1,140 @@ +package main + +import ( + "database/sql" + "intimate" + "log" + "regexp" + "strconv" + "strings" + "time" + + "github.com/474420502/extractor" + "github.com/474420502/requests" +) + +// sstore 源存储实例, 为存储源数据的实现. 表格具体参考sql/intimate_source.sql +var sstore *intimate.StoreSource = intimate.NewStoreSource(string(intimate.STOpenrec)) + +// estore 解析存储连接实例 +var estore *intimate.StoreExtractor = intimate.NewStoreExtractor() + +type LiveData struct { + UserName string `exp:"//span[@class='tw-live-author__info-username']" method:"Text"` + Follower string `exp:"(//span[@class='tw-user-nav-list-count'])[2]" method:"Text"` + MaxViews string `exp:"//span[@id='max_viewer_count']" method:"Text"` + LiveTitle string `exp:"//meta[@property='og:title']" method:"AttributeValue,content"` + LiveStart string `exp:"//span[@id='updatetimer']" method:"AttributeValue,data-started-at"` + LiveDuration string `exp:"//span[@id='updatetimer']" method:"AttributeValue,data-duration"` + Tags []string `exp:"//div[@class='tw-live-author__commandbox--tags']//a[@class='tag tag-info']"` +} + +func main() { + + ps := intimate.NewPerfectShutdown() + + for !ps.IsClose() { + + streamer, err := estore.Pop(intimate.Ptwitcasting) + if err != nil { + log.Println(err) + } + + ses := requests.NewSession() + resp, err := ses.Get("https://twitcasting.tv/kyunenee09").Execute() + if err != nil { + log.Panic(err) + } + var ldata *LiveData + etor := extractor.ExtractXml(resp.Content()) + ldata = etor.GetObjectByTag(LiveData{}).(*LiveData) + + ldata.MaxViews = regexp.MustCompile("\\d+").FindString(ldata.MaxViews) + ldata.LiveStart = ldata.LiveStart[:len(ldata.LiveStart)-3] + ldata.LiveDuration = ldata.LiveDuration[:len(ldata.LiveDuration)-3] + // log.Println(etor.GetObjectByTag(LiveData{})) + coincount := 0 + + for i := 0; ; i++ { + + giverurl := "https://twitcasting.tv/kyunenee09/backers/" + strconv.Itoa(i) + resp, err = ses.Get(giverurl).Execute() + if err != nil { + log.Panic(err) + } + etor := extractor.ExtractXml(resp.Content()) + xp, err := etor.XPaths("//td[@class='tw-memorial-table-recent-point']") + if err != nil { + log.Panic(err) + } + + coins := xp.GetTexts() + + for _, cointxt := range coins { + scointxt := strings.Split(cointxt, "/") + if len(scointxt) == 2 { + coin := strings.Trim(scointxt[1], " ") + c, err := strconv.Atoi(coin) + if err == nil { + coincount += c + } + log.Println(coin, coincount) + } else { + log.Println("coin error: ", cointxt) + } + } + + if len(coins) < 20 { + break + } + } + + streamer.Platform = intimate.Ptwitcasting + streamer.UpdateTime = sql.NullTime{Time: time.Now(), Valid: true} + streamer.UpdateInterval = 60 + streamer.UserName = sql.NullString{String: ldata.UserName, Valid: true} + streamer.Operator = 0 + + clog := &intimate.CollectLog{} + clog.UserId = streamer.UserId + clog.Gratuity = sql.NullInt64{Int64: int64(coincount), Valid: true} + clog.Platform = streamer.Platform + clog.UpdateTime = streamer.UpdateTime + clog.LiveTitle = sql.NullString{String: ldata.LiveTitle, Valid: true} + fl, err := strconv.Atoi(ldata.Follower) + if err == nil { + clog.Followers = sql.NullInt64{Int64: int64(fl), Valid: true} + } else { + log.Println(err) + } + + views, err := strconv.Atoi(ldata.MaxViews) + if err == nil { + clog.Views = sql.NullInt64{Int64: int64(views), Valid: true} + } else { + log.Println(err) + } + + st, err := strconv.Atoi(ldata.LiveStart) + if err == nil { + startTime := time.Unix(int64(st), 0) + clog.LiveStartTime = sql.NullTime{Time: startTime, Valid: true} + dt, err := strconv.Atoi(ldata.LiveDuration) + if err == nil { + + endTime := startTime.Add((time.Duration)(dt) * time.Second) + clog.LiveEndTime = sql.NullTime{Time: endTime, Valid: true} + } else { + log.Println(err) + } + + } else { + log.Println(err) + } + + streamer.LatestLogUid = estore.InsertClog(clog) + estore.UpdateStreamer(streamer) + + break + } +} diff --git a/extractor/twitcasting_extractor/twitcasting_extractor_test.go b/extractor/twitcasting_extractor/twitcasting_extractor_test.go new file mode 100644 index 0000000..2d4df4f --- /dev/null +++ b/extractor/twitcasting_extractor/twitcasting_extractor_test.go @@ -0,0 +1,12 @@ +package main + +import "testing" + +// type LiveData struct { +// UserName string `exp:".//span[@class='tw-live-author__info-username']" method:"Text"` +// Follower string `exp:".//span[@class='tw-user-nav-list-count']" method:"Text"` +// } + +func TestMain(t *testing.T) { + main() +} diff --git a/extractor/twitch_extractor/tiwtch_extractor.go b/extractor/twitch_extractor/tiwtch_extractor.go index bd2c6cf..187ae3e 100644 --- a/extractor/twitch_extractor/tiwtch_extractor.go +++ b/extractor/twitch_extractor/tiwtch_extractor.go @@ -6,6 +6,7 @@ import ( "intimate" "log" "regexp" + "strings" "time" "github.com/tebeka/selenium" @@ -48,6 +49,7 @@ func main() { var updateUrl map[string]string json.Unmarshal(streamer.UpdateUrl.([]byte), &updateUrl) liveUrl := updateUrl["live"] + liveUrl = strings.Replace(liveUrl, "/watchparty", "", -1) log.Println(liveUrl) // err = wd.Get("https://www.twitch.tv/zoe_0601" + "/about") @@ -67,6 +69,13 @@ func main() { time.Sleep(time.Millisecond * 500) err = extractUserName(wd, streamer) if err != nil { + _, err = wd.FindElement(selenium.ByXPATH, "//a[@data-a-target='browse-channels-button']") + if err == nil { + log.Println(streamer.UserId, "may be cancell") + streamer.Operator = 5 + streamer.UpdateTime = sql.NullTime{Time: time.Now(), Valid: true} + estore.UpdateStreamer(streamer) + } continue } err = extractFollowers(wd, clog) @@ -94,7 +103,7 @@ func main() { } streamer.Platform = intimate.Ptwitch - clog.Platform = string(streamer.Platform) + clog.Platform = streamer.Platform clog.UpdateTime = sql.NullTime{Time: time.Now(), Valid: true} lastClogId := estore.InsertClog(clog) diff --git a/extractor_field.go b/extractor_field.go index c091c5b..2977941 100644 --- a/extractor_field.go +++ b/extractor_field.go @@ -78,7 +78,7 @@ type CollectLog struct { LogUid int64 // 日志id StreamerUid int64 // StreamerId 表id与 - Platform string // + Platform Platform // UserId string // 平台的UserId IsLiveStreaming bool // IsError bool // diff --git a/go.mod b/go.mod index 406d3f9..35e0665 100644 --- a/go.mod +++ b/go.mod @@ -3,14 +3,13 @@ module intimate go 1.14 require ( - github.com/474420502/extractor v0.4.1 + github.com/474420502/extractor v0.5.2 github.com/474420502/focus v0.12.0 github.com/474420502/gcurl v0.1.2 github.com/474420502/hunter v0.3.4 github.com/474420502/requests v1.6.0 github.com/go-sql-driver/mysql v1.5.0 github.com/lestrrat-go/libxml2 v0.0.0-20200215080510-6483566f52cb - github.com/stretchr/testify v1.6.1 // indirect github.com/tebeka/selenium v0.9.9 github.com/tidwall/gjson v1.6.0 github.com/tidwall/pretty v1.0.1 // indirect diff --git a/go.sum b/go.sum index 119a3b6..32dceb6 100644 --- a/go.sum +++ b/go.sum @@ -2,44 +2,18 @@ cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMT cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= cloud.google.com/go v0.38.0/go.mod h1:990N+gfupTy94rShfmMCWGDn0LpTmnzTp2qbd1dvSRU= cloud.google.com/go v0.41.0/go.mod h1:OauMR7DV8fzvZIl2qg6rkaIhD/vmgk4iwEw/h6ercmg= -github.com/474420502/extractor v0.3.0 h1:VURhjNFP2kG6DvPZfsRR/3JLYHURvsHazp/JazNYbME= -github.com/474420502/extractor v0.3.0/go.mod h1:thq0UAm30cMLY6+LJHPNRSw/H3ZrMGfmK0rk+HwycvE= -github.com/474420502/extractor v0.3.1 h1:IxOeJziOR3DPrZJhOcbOUzAc/UABmKUYGLdVgxSi9yk= -github.com/474420502/extractor v0.3.1/go.mod h1:thq0UAm30cMLY6+LJHPNRSw/H3ZrMGfmK0rk+HwycvE= -github.com/474420502/extractor v0.3.2 h1:KcgRC0+pNfK803uZjL76pgsfsnlKSMR1nQX6o6y8cVA= -github.com/474420502/extractor v0.3.2/go.mod h1:yQRtpUOeb37tMitCsenURnN2Yas9Jm/5HGFDCO+/20k= -github.com/474420502/extractor v0.3.3 h1:2/rCOEtTVkezGqz7E0D8KKN1QBKlQaihe+UMxNZcwNk= -github.com/474420502/extractor v0.3.3/go.mod h1:8cakB/mW3No6o2I7PtrVHQ35auIgHh0mGIfk1++UZm4= -github.com/474420502/extractor v0.3.4 h1:3lKV5oke46sDAxkiY4KGMeBiYI8hwNkiAa2Sf8B+xPY= -github.com/474420502/extractor v0.3.4/go.mod h1:+biDin5eKLuJQHNbW+HnPYCC+2LL090iCZNxQklB11Y= -github.com/474420502/extractor v0.3.5 h1:uq3SuPY51F1pYvAtnaJtcqtJ+yE7wcaq3LP9DWTtBnQ= -github.com/474420502/extractor v0.3.5/go.mod h1:pKjqYQCZquakvor/d9JJQYrTYInWKaVXjzAg+IM1/tY= -github.com/474420502/extractor v0.3.6 h1:Qsky2YYUCENz3BFzlFOOWykFyDOfigbkkCTnMAkKExE= -github.com/474420502/extractor v0.3.6/go.mod h1:rH+/kx0CS8xpzOBqraisQE1A9vfXAPZZ+091D8HYXvw= -github.com/474420502/extractor v0.3.7 h1:QDBd4mAjf6D+vH98LQ1SJByDTtLago9GDiEvN1oyDJ0= -github.com/474420502/extractor v0.3.7/go.mod h1:v0TAfUw1zNyFCYVqj5xyFVFpoqmqErvAd2SzMzR/yc8= -github.com/474420502/extractor v0.4.0 h1:h6MbrkCBPQ2/+VRAK741oVcZuDhZ2t4USt0MOIf/v2U= -github.com/474420502/extractor v0.4.0/go.mod h1:1oPuXIm7whY+/rU7hxDW3ick4hHc4AdiNqdk5vVWaXs= -github.com/474420502/extractor v0.4.1 h1:WqcwF7gyvGREBrXBAm3fLR7yqxP/P/arq/iHXZvt8Gg= -github.com/474420502/extractor v0.4.1/go.mod h1:1oPuXIm7whY+/rU7hxDW3ick4hHc4AdiNqdk5vVWaXs= +github.com/474420502/extractor v0.5.1 h1:A1heJJSYbV9nEaUHfl3/1HYXcsBQfsTzAHikgwg2IF0= +github.com/474420502/extractor v0.5.1/go.mod h1:vkqsbi7wXPqyi5Q5dchcGjiaWHbgOJOAEcwonBiAs/E= +github.com/474420502/extractor v0.5.2 h1:ndgrAkxJjQg0Nrbq3AX2/xAnmIJNxSHRFGQ78wEtWj4= +github.com/474420502/extractor v0.5.2/go.mod h1:vkqsbi7wXPqyi5Q5dchcGjiaWHbgOJOAEcwonBiAs/E= github.com/474420502/focus v0.12.0 h1:+icbmj7IEOefvTegHt5EpcHt6WFbe2miIrceUJx2Evo= github.com/474420502/focus v0.12.0/go.mod h1:d0PMjtMxFz1a9HIhwyFPkWa+JF+0LgOrEUfd8iZka6s= github.com/474420502/gcurl v0.1.2 h1:ON9Yz3IgAdtDlFlHfkAJ3aIEBDxH0RiViPE5ST5ohKg= github.com/474420502/gcurl v0.1.2/go.mod h1:hws5q/Ao64bXLLDnldz9VyTQUndTWc/i5DzdEazFfoM= +github.com/474420502/htmlquery v1.2.4-0.20200810165859-a0e2c521c7c2 h1:4F1tpJ+sEkb3N+XD+Wb9MFiQmOMm3bHp8QUP+BQvkVk= +github.com/474420502/htmlquery v1.2.4-0.20200810165859-a0e2c521c7c2/go.mod h1:AoSN890esHwNKecV0tCs+W0ele1xgFL1Jqk6UcrdxgU= github.com/474420502/hunter v0.3.4 h1:fyLAgI84jWe3IcqsISC53j1w3CXI1FERxX//Potns0M= github.com/474420502/hunter v0.3.4/go.mod h1:pe4Xr/I+2agvq339vS/OZV+EiHAWtpXQs75rioSW9oA= -github.com/474420502/libxml2 v0.0.0-20200806111302-aa4be92ad592 h1:kgvx2MvoMhkrzLVjM6C6RIcshgI80fnq5/LqAnTOMxQ= -github.com/474420502/libxml2 v0.0.0-20200806111302-aa4be92ad592/go.mod h1:bUbcte7hFuLijGG6/+gGxurW3XvxE/CBdfAAlsIWj34= -github.com/474420502/libxml2 v0.0.0-20200807033034-1b43ad443d1d h1:MQduBAgnOCeGVUU+tawJxQLP1/Bgnn7119hGpVb9VFI= -github.com/474420502/libxml2 v0.0.0-20200807033034-1b43ad443d1d/go.mod h1:bUbcte7hFuLijGG6/+gGxurW3XvxE/CBdfAAlsIWj34= -github.com/474420502/libxml2 v0.0.0-20200807033649-9731e0a44bf0 h1:EiO+pSoFk7TTv/TnVFCT/swjWQEeLAZ2wXeXsS+9+kY= -github.com/474420502/libxml2 v0.0.0-20200807033649-9731e0a44bf0/go.mod h1:bUbcte7hFuLijGG6/+gGxurW3XvxE/CBdfAAlsIWj34= -github.com/474420502/libxml2 v0.0.0-20200807034854-eaa2a69a2790 h1:vzHGXv0e7MX+MSZcz4SjRJUfzoUpX96Qf0f48T6dkxk= -github.com/474420502/libxml2 v0.0.0-20200807034854-eaa2a69a2790/go.mod h1:bUbcte7hFuLijGG6/+gGxurW3XvxE/CBdfAAlsIWj34= -github.com/474420502/libxml2 v0.0.0-20200807035356-cd2e51185f4b h1:q9qSCx9gm7gS6Xr2nmKqkiu2FApQJFkqvTsrAzcWXps= -github.com/474420502/libxml2 v0.0.0-20200807035356-cd2e51185f4b/go.mod h1:bUbcte7hFuLijGG6/+gGxurW3XvxE/CBdfAAlsIWj34= -github.com/474420502/libxml2 v0.0.0-20200807040518-4ef6186ae68c h1:UZriMoPoXEA4Mq/yP+36sxwkOC3Jk3nqy2I7e3ZV470= -github.com/474420502/libxml2 v0.0.0-20200807040518-4ef6186ae68c/go.mod h1:bUbcte7hFuLijGG6/+gGxurW3XvxE/CBdfAAlsIWj34= github.com/474420502/requests v1.6.0 h1:f4h4j40eT0P5whhg9LdkotD8CaKjtuDu/vz9iSUkCgY= github.com/474420502/requests v1.6.0/go.mod h1:SLXrQ5dL9c7dkIeKNUCBAjOIt3J9KFCS2RQjWJecNwo= github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ= @@ -52,9 +26,8 @@ github.com/Pallinder/go-randomdata v1.1.0 h1:gUubB1IEUliFmzjqjhf+bgkg1o6uoFIkRsP github.com/Pallinder/go-randomdata v1.1.0/go.mod h1:yHmJgulpD2Nfrm0cR9tI/+oAgRqCQQixsA8HyRZfV9Y= github.com/Pallinder/go-randomdata v1.2.0 h1:DZ41wBchNRb/0GfsePLiSwb0PHZmT67XY00lCDlaYPg= github.com/Pallinder/go-randomdata v1.2.0/go.mod h1:yHmJgulpD2Nfrm0cR9tI/+oAgRqCQQixsA8HyRZfV9Y= -github.com/antchfx/htmlquery v1.2.3/go.mod h1:B0ABL+F5irhhMWg54ymEZinzMSi0Kt3I2if0BLYa3V0= +github.com/antchfx/xpath v1.1.6 h1:6sVh6hB5T6phw1pFpHRQ+C4bd8sNI+O58flqtg7h0R0= github.com/antchfx/xpath v1.1.6/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= -github.com/antchfx/xpath v1.1.10/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= github.com/blang/semver v3.5.1+incompatible h1:cQNTCjp13qL8KC3Nbxr/y2Bqb63oX6wdnnjpJbkM4JQ= @@ -71,6 +44,7 @@ github.com/go-sql-driver/mysql v1.5.0 h1:ozyZYNQW3x3HtqT1jira07DN2PArx2v7/mN66gG github.com/go-sql-driver/mysql v1.5.0/go.mod h1:DCzpHaOWr8IXmIStZouvnhqoel9Qv2LBy8hT2VhHyBg= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b h1:VKtxabqXZkF25pY9ekfRL6a582T4P37/31XEstQ5p58= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= +github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e h1:1r7pUrabqp18hOBcwBwiTsbnFeTZHV9eER/QT5JVZxY= github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= diff --git a/store.go b/store.go index e83718f..da79aa2 100644 --- a/store.go +++ b/store.go @@ -346,7 +346,7 @@ func (store *StoreExtractor) InsertStreamer(streamer IGet) (isExists bool) { return true } - _, err = tx.Exec("INSERT INTO "+StreamerTable+"(platform, user_id, update_url, tags, update_time) VALUES(?,?,?,?,?);", streamer.Get("Platform"), streamer.Get("UserId"), streamer.Get("UpdateUrl"), streamer.Get("Tags"), time.Now().Add(-time.Minute*60)) + _, err = tx.Exec("INSERT INTO "+StreamerTable+"(platform, user_id, update_url, tags, update_time) VALUES(?,?,?,?,?);", streamer.Get("Platform"), streamer.Get("UserId"), streamer.Get("UpdateUrl"), streamer.Get("Tags"), time.Now().Add(-time.Hour*100000)) if err != nil { panic(err) } diff --git a/tasks/twitcasting/twitcasting_task1/.gitignore b/tasks/twitcasting/twitcasting_task1/.gitignore new file mode 100644 index 0000000..de4f65a --- /dev/null +++ b/tasks/twitcasting/twitcasting_task1/.gitignore @@ -0,0 +1,2 @@ +twitcasting_task1 +log \ No newline at end of file diff --git a/tasks/twitcasting/twitcasting_task1/error.html b/tasks/twitcasting/twitcasting_task1/error.html new file mode 100755 index 0000000..3a41b64 --- /dev/null +++ b/tasks/twitcasting/twitcasting_task1/error.html @@ -0,0 +1,991 @@ + + + + Live with Tag: vocaloid - TwitCasting + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ +
+ + + + +
+ +
+ +
+
+ +

+ Tag Search : + vocaloid

+ + + Live(0) / + User(28) + + + + + + +
+
+ +
+
+
+
+
+
+ + + +
+
+ + 月詠來夢@きゃす + +
@LhymeCyas
+
+
+
月詠來夢のきゃす垢
+ +
+
+
+
+
+
+
+ + + +
+
+ + SHIRATAMA + +
@c:shiratama_dango
+
+
+
A creature which enjoy singing, humming while walking, and eating.
+ +
+
+
+
+
+
+
+ + + +
+
+ + ずんちゃ + +
@zunguri1459
+
+
+
はろはわゆ
+ +
+
+
+
+
+
+
+ + + +
+
+ + したばま ましか + +
@kyabet001
+
+
+
2Dホラーアクションシューティング「CARLA」を製作中です PV: https://t.co/PXBDoaNFUV よかったらフォロ/フォロバオナシャス
+ +
+
+
+
+
+
+
+ + + +
+
+ + ぐろぐ + +
@c:grog
+
+
+
+ +
+
+
+
+
+
+
+ + + +
+
+ + んぁゆ (NaYu) + +
@NaYu_NotYou
+
+
+
🇮🇩🛫🇦🇺 || Main Account || Korean Related: @NaYu_ha || Pull me into more hells- || icon: @AyaminTwT
+ +
+
+
+
+
+
+
+ + + +
+
+ + このは@モンハン勢 + +
@0konoha39
+
+
+
DIVA、白猫、モンハンが大好きな大学生です(((((└(:D」┌)┘)))))))
+ +
+
+
+
+
+
+
+ + + +
+
+ + f:Egao No ShouJou + +
@f:100000242133373
+
+
+
+ +
+
+
+
+
+
+
+ + + +
+
+ + みゆ + +
@rubierin_sna
+
+
+
sing! やってます✨
+ +
+
+
+
+
+
+
+ + + +
+
+ + f:Yuu'll Be Back + +
@f:100004051574775
+
+
+
+ +
+
+
+
+
+
+
+ + + +
+
+ + Mongdang + +
@c:Mongdang
+
+
+
+ +
+
+
+
+
+
+
+ + + +
+
+ + 狼谷ありー + +
@kamiy_ari
+
+
+
歌が大好きです。
+ +
+
+
+
+
+
+
+ + + +
+
+ + f:Thamanan Wittayachamnankul + +
@f:100000714321056
+
+
+
อ่าา ชอบเกรียนที่สุดเบยย อ๊ายยย ขอให้ได้เกรียน เกรียนในสิ่งที่ชอบ และไม่เดือดร้อนคนอื่น โอเคโน๊ะ อยากจะเกรียนเพลงใส่กันเชิญจ้า กร๊ากก ผมเกรียนได้ทุกคนอยู่แระ ..
+ +
+
+
+
+
+
+
+ + + +
+
+ + hika + +
@hikamody
+
+
+
성인/잡덕/Bi🌈/Feminist/Flexitarian
+ +
+
+
+
+
+
+
+ + + +
+
+ + totally katië 123 + +
@katie_diva_xoxo
+
+
+
❤️❤️❤️zoella
+ +
+
+
+
+
+
+
+ + + +
+
+ + Fujisaki Hitomi + +
@c:vlemvpe
+
+
+
Fujisaki Hitomi +https://www.facebook.com/ciazfah2 +คนไทยค่า :D
+ +
+
+
+
+
+
+
+ + + +
+
+ + 初音@yu卍nan + +
@c:hatune3625
+
+
+
初音ミク大好き
+ +
+
+
+
+
+
+
+ + + +
+
+ + Sarah + +
@c:sara21ren
+
+
+
Hello. It's Sarah (: I play the piano and the guitar. Sometimes sing. Ttm hmu!!
+ +
+
+
+
+
+
+
+ + + +
+
+ + カワズ先輩≠カエル先輩 + +
@kawazu1816
+
+
+
愛したって、愛されたっていいじゃないか
+ +
+
+
+
+
+
+
+ + + +
+
+ + ヴァレン(実況者) + +
@varenturu
+
+
+
不定期で実況動画あげていく予定です。 よろしくお願いします!! 異常なほどな誤字脱字etc...沢山のハプニングがあると思います。 温かい目で見てやってくれればありが..
+ +
+
+
+
+
+
+
+ + + +
+
+ + BARI + +
@c:iasoa020
+
+
+
歌を歌います。少しでもプラスに慣れればと思います。
+ +
+
+
+
+
+
+
+ + + +
+
+ + f:Xio Steph + +
@f:100007528892324
+
+
+
+ +
+
+
+
+
+
+
+ + + +
+
+ + f:Kama Jewell Greco + +
@f:489672901190991
+
+
+
+ +
+
+
+
+
+
+
+ + + +
+
+ + 姫柳/かんじむずかしい + +
@Kiryu_Hanabusa
+
+
+
YouTuberになるために準備中です。 基本受けの体制なので話し掛けてもらうまでROMってることが多いです。気軽にお声掛けください! codevein/競馬/麻雀/雀魂/FPS #姫柳の..
+ +
+
+
+
+ +
ただのザコなボカロファンです。
+ +
+
+
+
+
+
+
+ + + +
+
+ + (๑•̀ω•́๑) + +
@porkyuupine
+
+
+
❤ VOCALOID | 歌い手 | アニメと漫画 ❤ シンガポールからのファンです、よろしく〜☆  
+ +
+
+
+
+
+
+
+ + + +
+
+ + miming(みみん) + +
@c:mimingdayo
+
+
+
韓国人です。日本語勉強はじめたのは10年くらい前からで +留学はしたことありません。 +まぁ気ままにやります。 +ヾ(⌒(_*'ω'*)_
+ +
+
+
+ + + + + +
+ + + + + +
+
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tasks/twitcasting/twitcasting_task1/main.go b/tasks/twitcasting/twitcasting_task1/main.go index 06ab7d0..736ef31 100644 --- a/tasks/twitcasting/twitcasting_task1/main.go +++ b/tasks/twitcasting/twitcasting_task1/main.go @@ -1 +1,5 @@ package main + +func main() { + Execute() +} diff --git a/tasks/twitcasting/twitcasting_task1/main_test.go b/tasks/twitcasting/twitcasting_task1/main_test.go index 7acc76b..18bfb6a 100644 --- a/tasks/twitcasting/twitcasting_task1/main_test.go +++ b/tasks/twitcasting/twitcasting_task1/main_test.go @@ -1,26 +1,9 @@ package main import ( - "database/sql" - "encoding/json" - "intimate" - "net/http" "net/url" - "os" - "os/signal" - "syscall" - "time" - "github.com/474420502/extractor" - "github.com/474420502/focus/compare" - "github.com/474420502/focus/tree/heap" - - "log" "testing" - - _ "net/http/pprof" - - "github.com/474420502/requests" ) func Test(t *testing.T) { @@ -30,161 +13,13 @@ func Test(t *testing.T) { t.Error(u.String()) } -// sstore 源存储实例, 为存储源数据的实现. 表格具体参考sql/intimate_source.sql -var sstore *intimate.StoreSource = intimate.NewStoreSource(string(intimate.STTwitcasting)) - -// estore 解析存储连接实例 -var estore *intimate.StoreExtractor = intimate.NewStoreExtractor() +func TestUpdateTime(t *testing.T) { + // streamer := &intimate.Streamer{} + // streamer.Uid = 420153 + // streamer.UpdateTime = sql.NullTime{Time: time.Now(), Valid: true} + // estore.Update(streamer, "update_time", streamer.UpdateTime) +} func TestMain(t *testing.T) { - f, _ := os.OpenFile("./log", os.O_TRUNC|os.O_CREATE|os.O_RDWR, os.ModePerm) - log.SetFlags(log.Llongfile | log.Ltime) - log.SetOutput(f) - go func() { - log.Println(http.ListenAndServe(":4040", nil)) - }() - - homeurl := "https://twitcasting.tv" - searchurl := "https://twitcasting.tv/rankingindex.php" - queuedict := make(map[string]bool) - queue := heap.New(compare.String) - queue.Put(searchurl) - queuedict[searchurl] = true - ses := requests.NewSession() - ses.Config().SetTimeout(15) - - var surl interface{} - var ok bool - var debugsp *SearchProfile - var content []byte - - defer func() { - if ierr := recover(); ierr != nil { - log.Println(surl, debugsp) - f, _ := os.OpenFile("./error.html", os.O_TRUNC|os.O_CREATE|os.O_RDWR, os.ModePerm) - f.Write(content) - f.Close() - log.Panic(ierr) - } - }() - - go func() { - signalchan := make(chan os.Signal) - signal.Notify(signalchan, syscall.SIGINT, syscall.SIGKILL, syscall.SIGQUIT, syscall.SIGTERM, syscall.SIGSTOP) - log.Println("accept stop command:", <-signalchan) - f, _ := os.OpenFile("./error.html", os.O_TRUNC|os.O_CREATE|os.O_RDWR, os.ModePerm) - f.Write(content) - f.Close() - os.Exit(1) - }() - - for surl, ok = queue.Pop(); ok; surl, ok = queue.Pop() { - u, err := url.Parse(surl.(string)) - if err != nil { - log.Println(err) - continue - } - - resp, err := ses.Get(u.String()).Execute() - if err != nil { - log.Println(err) - log.Println(u.String(), surl) - continue - // log.Panic(err) - } - - content = resp.Content() - etor := extractor.ExtractXml(resp.Content()) - result, err := etor.XPath("//p[@class='taglist']/a[contains(@class, 'tag')]/@href") - if err != nil { - panic(err) - } - - iter := result.NodeIter() - for iter.Next() { - - wurl := homeurl + iter.Node().NodeValue() - if ok := queuedict[wurl]; !ok { - log.Println(wurl) - sl := &intimate.StreamerList{} - sl.Platform = intimate.Ptwitcasting - sl.Url = wurl - sl.Operator = 0 - sl.UpdateInterval = 120 - sl.UpdateTime = time.Now() - - estore.InsertStreamerList(sl) - - queue.Put(wurl) - queuedict[wurl] = true - } - } - - // doc.Find("//div[@class='tw-search-result-row']") - xps, err := etor.XPaths("//div[@class='tw-search-result-row']") - if err != nil { - log.Println(surl, err) - continue - } - - log.Println("extract tag") - var splist = xps.ForEachTag(SearchProfile{}) - log.Println("finish extract tag") - for _, isp := range splist { - sp := isp.(*SearchProfile) - if sp.LiveUrl == "" { - continue - } - - sp.UserId = sp.LiveUrl[1:] - for i := 0; i < len(sp.TagUrl); i++ { - wurl := homeurl + sp.TagUrl[i] - sp.TagUrl[i] = wurl - if ok := queuedict[wurl]; !ok { - sl := &intimate.StreamerList{} - sl.Platform = intimate.Ptwitcasting - sl.Url = wurl - sl.Operator = 0 - sl.UpdateInterval = 120 - sl.UpdateTime = time.Now() - estore.InsertStreamerList(sl) - - queue.Put(wurl) - queuedict[wurl] = true - } - } - // log.Println(sp.(SearchProfile)) - } - - log.Println("find user:", len(splist)) - for _, isp := range splist { - sp := isp.(*SearchProfile) - // log.Println(sp) - streamer := &intimate.Streamer{} - streamer.Platform = intimate.Ptwitcasting - streamer.LiveUrl = sql.NullString{String: sp.LiveUrl, Valid: true} - if btags, err := json.Marshal(sp.Tag); err != nil { - log.Println(err) - } else { - streamer.Tags = btags - } - streamer.UpdateInterval = 120 - streamer.UpdateTime = sql.NullTime{Time: time.Now(), Valid: true} - streamer.UserName = sql.NullString{String: sp.UserName, Valid: true} - streamer.UserId = sp.UserId - debugsp = sp - estore.InsertStreamer(streamer) - } - - log.Println("finish remain", queue.Size()) - } -} - -type SearchProfile struct { - UserName string `exp:".//span[@class='username']" method:"Text"` - UserId string // `exp:".//span[@class='fullname']" method:"Text"` - LiveUrl string `exp:".//div[@class='usertext']/a[@href]" method:"Attribute,href Value"` - Tag []string `exp:".//a[contains(@class, 'tag tag-mini')]" method:"Text"` - TagUrl []string `exp:".//a[contains(@class, 'tag tag-mini')]" method:"Attribute,href Value"` } diff --git a/tasks/twitcasting/twitcasting_task1/twitcasting.go b/tasks/twitcasting/twitcasting_task1/twitcasting.go new file mode 100644 index 0000000..58e2cba --- /dev/null +++ b/tasks/twitcasting/twitcasting_task1/twitcasting.go @@ -0,0 +1,140 @@ +package main + +import ( + "database/sql" + "encoding/json" + "intimate" + "log" + "net/url" + "time" + + "github.com/474420502/extractor" + "github.com/474420502/focus/compare" + "github.com/474420502/focus/tree/heap" + "github.com/474420502/requests" +) + +// sstore 源存储实例, 为存储源数据的实现. 表格具体参考sql/intimate_source.sql +var sstore *intimate.StoreSource = intimate.NewStoreSource(string(intimate.STTwitcasting)) + +// estore 解析存储连接实例 +var estore *intimate.StoreExtractor = intimate.NewStoreExtractor() + +type SearchProfile struct { + UserName string `exp:".//span[@class='username']" method:"Text"` + UserId string // `exp:".//span[@class='fullname']" method:"Text"` + LiveUrl string `exp:".//div[@class='usertext']/a[@href]" method:"AttributeValue,href"` + Tag []string `exp:".//a[contains(@class, 'tag tag-mini')]" method:"Text"` + TagUrl []string `exp:".//a[contains(@class, 'tag tag-mini')]" method:"AttributeValue,href"` +} + +func Execute() { + homeurl := "https://twitcasting.tv" + searchurl := "https://twitcasting.tv/rankingindex.php" + queuedict := make(map[string]bool) + queue := heap.New(compare.String) + queue.Put(searchurl) + queuedict[searchurl] = true + ses := requests.NewSession() + ses.Config().SetTimeout(15) + + var surl interface{} + var ok bool + + ps := intimate.NewPerfectShutdown() + + for surl, ok = queue.Pop(); ok && !ps.IsClose(); surl, ok = queue.Pop() { + u, err := url.Parse(surl.(string)) + if err != nil { + log.Println(err) + continue + } + + resp, err := ses.Get(u.String()).Execute() + if err != nil { + log.Println(err) + log.Println(u.String(), surl) + continue + // log.Panic(err) + } + + etor := extractor.ExtractXml(resp.Content()) + result, err := etor.XPaths("//p[@class='taglist']/a[contains(@class, 'tag')]/@href") + if err != nil { + panic(err) + } + + for _, href := range result.GetTexts() { + + wurl := homeurl + href + if ok := queuedict[wurl]; !ok { + log.Println(wurl) + sl := &intimate.StreamerList{} + sl.Platform = intimate.Ptwitcasting + sl.Url = wurl + sl.Operator = 0 + sl.UpdateInterval = 120 + sl.UpdateTime = time.Now() + + estore.InsertStreamerList(sl) + + queue.Put(wurl) + queuedict[wurl] = true + } + } + + xps, err := etor.XPaths("//div[@class='tw-search-result-row']") + if err != nil { + log.Println(surl, err) + continue + } + + var splist = xps.ForEachTag(SearchProfile{}) + for _, isp := range splist { + sp := isp.(*SearchProfile) + if sp.LiveUrl == "" { + continue + } + + sp.UserId = sp.LiveUrl[1:] + for i := 0; i < len(sp.TagUrl); i++ { + wurl := homeurl + sp.TagUrl[i] + sp.TagUrl[i] = wurl + if ok := queuedict[wurl]; !ok { + sl := &intimate.StreamerList{} + sl.Platform = intimate.Ptwitcasting + sl.Url = wurl + sl.Operator = 0 + sl.UpdateInterval = 120 + sl.UpdateTime = time.Now() + estore.InsertStreamerList(sl) + + queue.Put(wurl) + queuedict[wurl] = true + } + } + // log.Println(sp.(SearchProfile)) + } + + log.Println("find user:", len(splist)) + for _, isp := range splist { + sp := isp.(*SearchProfile) + // log.Println(sp) + streamer := &intimate.Streamer{} + streamer.Platform = intimate.Ptwitcasting + streamer.LiveUrl = sql.NullString{String: sp.LiveUrl, Valid: true} + if btags, err := json.Marshal(sp.Tag); err != nil { + log.Println(err) + } else { + streamer.Tags = btags + } + streamer.UpdateInterval = 120 + streamer.UpdateTime = sql.NullTime{Time: time.Now(), Valid: true} + streamer.UserName = sql.NullString{String: sp.UserName, Valid: true} + streamer.UserId = sp.UserId + estore.InsertStreamer(streamer) + } + + log.Println("finish remain", queue.Size()) + } +} From 16c3ed93b7285468c3d27178c04e91a6b3e34474 Mon Sep 17 00:00:00 2001 From: eson Date: Wed, 12 Aug 2020 12:22:46 +0800 Subject: [PATCH 6/6] finish twitcasting --- extractor/twitcasting_extractor/.gitignore | 4 ++ .../twitcasting_extractor.go | 67 +++++++++++-------- .../twitcasting_extractor_test.go | 13 +++- utils.go | 21 ++++++ 4 files changed, 75 insertions(+), 30 deletions(-) create mode 100644 extractor/twitcasting_extractor/.gitignore diff --git a/extractor/twitcasting_extractor/.gitignore b/extractor/twitcasting_extractor/.gitignore new file mode 100644 index 0000000..dea3a6c --- /dev/null +++ b/extractor/twitcasting_extractor/.gitignore @@ -0,0 +1,4 @@ +*.html +log +screenlog.* +twitcasting_extractor \ No newline at end of file diff --git a/extractor/twitcasting_extractor/twitcasting_extractor.go b/extractor/twitcasting_extractor/twitcasting_extractor.go index 44e059f..0550e6a 100644 --- a/extractor/twitcasting_extractor/twitcasting_extractor.go +++ b/extractor/twitcasting_extractor/twitcasting_extractor.go @@ -24,40 +24,39 @@ type LiveData struct { Follower string `exp:"(//span[@class='tw-user-nav-list-count'])[2]" method:"Text"` MaxViews string `exp:"//span[@id='max_viewer_count']" method:"Text"` LiveTitle string `exp:"//meta[@property='og:title']" method:"AttributeValue,content"` - LiveStart string `exp:"//span[@id='updatetimer']" method:"AttributeValue,data-started-at"` + LiveStart string `exp:"//time[@data-kind='relative']" method:"AttributeValue,datetime"` LiveDuration string `exp:"//span[@id='updatetimer']" method:"AttributeValue,data-duration"` - Tags []string `exp:"//div[@class='tw-live-author__commandbox--tags']//a[@class='tag tag-info']"` + Tags []string `exp:"//div[@class='tw-live-author__commandbox--tags']//a[@class='tag tag-info']" method:"Text"` } func main() { ps := intimate.NewPerfectShutdown() + ses := requests.NewSession() for !ps.IsClose() { streamer, err := estore.Pop(intimate.Ptwitcasting) if err != nil { - log.Println(err) + log.Println(err, streamer.UserId) } - ses := requests.NewSession() - resp, err := ses.Get("https://twitcasting.tv/kyunenee09").Execute() + streamer.LiveUrl = sql.NullString{String: "https://twitcasting.tv/" + streamer.UserId, Valid: true} + resp, err := ses.Get(streamer.LiveUrl.String).Execute() if err != nil { - log.Panic(err) + estore.UpdateError(streamer, err) + log.Println(err, streamer.UserId) + continue } var ldata *LiveData etor := extractor.ExtractXml(resp.Content()) ldata = etor.GetObjectByTag(LiveData{}).(*LiveData) - ldata.MaxViews = regexp.MustCompile("\\d+").FindString(ldata.MaxViews) - ldata.LiveStart = ldata.LiveStart[:len(ldata.LiveStart)-3] - ldata.LiveDuration = ldata.LiveDuration[:len(ldata.LiveDuration)-3] - // log.Println(etor.GetObjectByTag(LiveData{})) coincount := 0 for i := 0; ; i++ { - giverurl := "https://twitcasting.tv/kyunenee09/backers/" + strconv.Itoa(i) + giverurl := streamer.LiveUrl.String + "/backers/" + strconv.Itoa(i) resp, err = ses.Get(giverurl).Execute() if err != nil { log.Panic(err) @@ -69,7 +68,6 @@ func main() { } coins := xp.GetTexts() - for _, cointxt := range coins { scointxt := strings.Split(cointxt, "/") if len(scointxt) == 2 { @@ -78,7 +76,7 @@ func main() { if err == nil { coincount += c } - log.Println(coin, coincount) + // log.Println(coin, coincount) } else { log.Println("coin error: ", cointxt) } @@ -91,19 +89,28 @@ func main() { streamer.Platform = intimate.Ptwitcasting streamer.UpdateTime = sql.NullTime{Time: time.Now(), Valid: true} - streamer.UpdateInterval = 60 streamer.UserName = sql.NullString{String: ldata.UserName, Valid: true} - streamer.Operator = 0 - + streamer.Operator = 10 + // streamer.UpdateInterval = 60 clog := &intimate.CollectLog{} clog.UserId = streamer.UserId clog.Gratuity = sql.NullInt64{Int64: int64(coincount), Valid: true} clog.Platform = streamer.Platform clog.UpdateTime = streamer.UpdateTime clog.LiveTitle = sql.NullString{String: ldata.LiveTitle, Valid: true} - fl, err := strconv.Atoi(ldata.Follower) + fl, err := intimate.ParseNumberEx(ldata.Follower) if err == nil { clog.Followers = sql.NullInt64{Int64: int64(fl), Valid: true} + switch { + case fl <= 100: + streamer.UpdateInterval = 360 + case fl <= 1000: + streamer.UpdateInterval = 240 + case fl <= 100: + streamer.UpdateInterval = 120 + default: + streamer.UpdateInterval = 60 + } } else { log.Println(err) } @@ -112,29 +119,31 @@ func main() { if err == nil { clog.Views = sql.NullInt64{Int64: int64(views), Valid: true} } else { - log.Println(err) + clog.Views = sql.NullInt64{Int64: int64(0), Valid: true} + // log.Println(err, streamer.UserId) } - st, err := strconv.Atoi(ldata.LiveStart) + // st, err := strconv.Atoi(ldata.LiveStart) + st, err := time.Parse("Mon, 02 Jan 2006 15:04:05 -0700", ldata.LiveStart) if err == nil { - startTime := time.Unix(int64(st), 0) + startTime := st clog.LiveStartTime = sql.NullTime{Time: startTime, Valid: true} dt, err := strconv.Atoi(ldata.LiveDuration) - if err == nil { - - endTime := startTime.Add((time.Duration)(dt) * time.Second) - clog.LiveEndTime = sql.NullTime{Time: endTime, Valid: true} - } else { - log.Println(err) + if time.Now().Sub(startTime) >= time.Hour*24*90 { + streamer.Operator = 5 } + if err == nil { + endTime := startTime.Add((time.Duration)(dt) * time.Millisecond) + clog.LiveEndTime = sql.NullTime{Time: endTime, Valid: true} + } else { + log.Println(err, streamer.UserId) + } } else { - log.Println(err) + log.Println(err, streamer.UserId) } streamer.LatestLogUid = estore.InsertClog(clog) estore.UpdateStreamer(streamer) - - break } } diff --git a/extractor/twitcasting_extractor/twitcasting_extractor_test.go b/extractor/twitcasting_extractor/twitcasting_extractor_test.go index 2d4df4f..811b2d3 100644 --- a/extractor/twitcasting_extractor/twitcasting_extractor_test.go +++ b/extractor/twitcasting_extractor/twitcasting_extractor_test.go @@ -1,6 +1,9 @@ package main -import "testing" +import ( + "testing" + "time" +) // type LiveData struct { // UserName string `exp:".//span[@class='tw-live-author__info-username']" method:"Text"` @@ -10,3 +13,11 @@ import "testing" func TestMain(t *testing.T) { main() } + +func TestDateFormat(t *testing.T) { + df := "Sat, 09 Sep 2017 18:19:17 +0900" + + if _, err := time.Parse("Mon, 02 Jan 2006 15:04:05 -0700", df); err != nil { + t.Error(err) + } +} diff --git a/utils.go b/utils.go index bad4a08..cb7d782 100644 --- a/utils.go +++ b/utils.go @@ -34,6 +34,27 @@ func ParseNumber(number string) (int64, error) { return strconv.ParseInt(number, 10, 64) } +// ParseNumberEx 解析带字符的数字 +func ParseNumberEx(num string) (float64, error) { + num = strings.Trim(num, " ") + last := num[len(num)-1] + factor := 1.0 + switch { + case last == 'k' || last == 'K': + factor = 1000.0 + num = num[0 : len(num)-1] + case last == 'm' || last == 'M': + factor = 1000000.0 + num = num[0 : len(num)-1] + } + i, err := strconv.ParseFloat(num, 64) + if err != nil { + return 0, err + } + + return i * factor, nil +} + // ParseDuration time to duration eg: 1:40:00 -> time.Duration func ParseDuration(dt string) (time.Duration, error) {