From 75b1d9fb774ff782e6b567b1a14bbad9e066e03b Mon Sep 17 00:00:00 2001 From: eson Date: Wed, 15 Jul 2020 15:44:21 +0800 Subject: [PATCH] =?UTF-8?q?TODO:=20=E5=AE=8C=E5=96=84`=E4=B8=BB=E6=92=AD?= =?UTF-8?q?=E8=A1=A8`=E7=9A=84=E6=9B=B4=E6=96=B0=E6=96=B9=E5=BC=8F.=20?= =?UTF-8?q?=E5=AD=98=E5=9C=A8=E6=9B=B4=E6=96=B0=E6=9C=80=E6=96=B0=E7=8A=B6?= =?UTF-8?q?=E6=80=81=E6=97=B6,=20=E4=BC=9A=E8=A2=AB`=E6=94=B9=E7=89=88`?= =?UTF-8?q?=E7=B1=BB=E4=BC=BC=E6=83=85=E5=86=B5=E8=A6=86=E7=9B=96,=20?= =?UTF-8?q?=E4=BB=8E=E8=80=8C=E6=89=BE=E4=B8=8D=E5=88=B0=E5=8E=9F=E4=BF=A1?= =?UTF-8?q?=E6=81=AF=E6=95=B0=E6=8D=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extractor/openrec/openrec_test.go | 219 +++++++++++--------- go.mod | 1 + sql/intimate_extractor.sql | 8 +- sql/intimate_source.sql | 2 +- store.go | 6 +- tasks/openrec/openrec_task2/task_openrec.go | 2 +- utils.go | 28 +++ 7 files changed, 163 insertions(+), 103 deletions(-) create mode 100644 utils.go diff --git a/extractor/openrec/openrec_test.go b/extractor/openrec/openrec_test.go index bb73e3c..cc2cf15 100644 --- a/extractor/openrec/openrec_test.go +++ b/extractor/openrec/openrec_test.go @@ -4,85 +4,85 @@ import ( "database/sql" "encoding/json" "intimate" + "io/ioutil" "log" "os" "regexp" "strconv" "strings" "testing" + "time" "github.com/474420502/hunter" "github.com/474420502/requests" + "github.com/lestrrat-go/libxml2" "github.com/tidwall/gjson" ) -func preNUm(data byte) int { - var mask byte = 0x80 - var num int = 0 - //8bit中首个0bit前有多少个1bits - for i := 0; i < 8; i++ { - if (data & mask) == mask { - num++ - mask = mask >> 1 - } else { - break - } - } - return num -} -func isUtf8(data []byte) bool { - i := 0 - for i < len(data) { - if (data[i] & 0x80) == 0x00 { - // 0XXX_XXXX - i++ - continue - } else if num := preNUm(data[i]); num > 2 { - // 110X_XXXX 10XX_XXXX - // 1110_XXXX 10XX_XXXX 10XX_XXXX - // 1111_0XXX 10XX_XXXX 10XX_XXXX 10XX_XXXX - // 1111_10XX 10XX_XXXX 10XX_XXXX 10XX_XXXX 10XX_XXXX - // 1111_110X 10XX_XXXX 10XX_XXXX 10XX_XXXX 10XX_XXXX 10XX_XXXX - // preNUm() 返回首个字节的8个bits中首个0bit前面1bit的个数,该数量也是该字符所使用的字节数 - i++ - for j := 0; j < num-1; j++ { - //判断后面的 num - 1 个字节是不是都是10开头 - if (data[i] & 0xc0) != 0x80 { - return false - } - i++ - } - } else { - //其他情况说明不是utf-8 - return false - } - } - return true +func TestCase1(t *testing.T) { + date := "2020-07-13T18:58:24+09:00" + + tm, err := time.Parse("2006-01-02T15:04:05Z07:00", date) + t.Error(err) + t.Error(time.Now()) + t.Error(tm.Local().UTC(), tm.Local()) + } -func isGBK(data []byte) bool { - length := len(data) - var i int = 0 - for i < length { - if data[i] <= 0x7f { - //编码0~127,只有一个字节的编码,兼容ASCII码 - i++ - continue - } else { - //大于127的使用双字节编码,落在gbk编码范围内的字符 - if data[i] >= 0x81 && - data[i] <= 0xfe && - data[i+1] >= 0x40 && - data[i+1] <= 0xfe && - data[i+1] != 0xf7 { - i += 2 - continue - } else { - return false - } - } +func TestCase2(t *testing.T) { + duration1 := "0:00:00" + duration2 := "4:56:04" + tm2, err := time.Parse("15:04:05", duration2) + tm1, err := time.Parse("15:04:05", duration1) + + tm2.Sub(tm1) + + t.Error(err) + t.Error(tm2.Sub(tm1)) + +} + +func TestCase(t *testing.T) { + f, _ := os.Open("./test.html") + data, _ := ioutil.ReadAll(f) + + doc, err := libxml2.ParseHTML(data) + if err != nil { + panic(err) } - return true + // doc.CreateElement("meta") + // "" + + xresult, err := doc.Find("/html/head") + ele, err := doc.CreateElement(`META`) + + if err != nil { + panic(err) + } + ele.SetAttribute("charset", "utf-8") + + if err != nil { + panic(err) + } + + iter := xresult.NodeIter() + if iter.Next() { + n := iter.Node() + + err = n.AddChild(ele) + // childs, err := n.ChildNodes() + if err != nil { + t.Error(err) + } + t.Error(n) + } + + xr, err := doc.Find("//h1[ contains(@class, 'MovieTitle__Title')]") + if err != nil { + panic(nil) + } + + t.Error(xr) } func TestExtractor(t *testing.T) { @@ -96,6 +96,15 @@ func TestExtractor(t *testing.T) { collect := intimate.NewExtractorStore() store := intimate.NewSourceStore("source_openrec") source, err := store.Pop(string(intimate.TTOpenrecRanking), 100) + + anchorId := source.GetSource().String + + ai := &intimate.AnchorInfo{} + ai.SetAnchorId(anchorId) + ai.SetPlatform(string(intimate.Popenrec)) + + collect.InsertAnchorInfo(ai) + // if source != nil { // defer store.Restore(source) // } @@ -110,7 +119,7 @@ func TestExtractor(t *testing.T) { user := m["user"] - ai := &intimate.CollectLog{} + clog := &intimate.CollectLog{} extractor := hunter.NewExtractor([]byte(user.Str)) xp, err := extractor.XPathResult("//p[@class='c-global__user__count__row__right js-userCountFollowers']/text()") if err != nil { @@ -121,8 +130,8 @@ func TestExtractor(t *testing.T) { } followers := strings.ReplaceAll(xp.String(), ",", "") - followersInt, err := strconv.ParseInt(followers, 10, 64) + if err != nil { t.Error(err) } @@ -134,6 +143,7 @@ func TestExtractor(t *testing.T) { } else { t.Error(err) } + t.Error(source.GetSource()) t.Error(anchorName) // c-contents @@ -149,8 +159,8 @@ func TestExtractor(t *testing.T) { t.Error(err) } - ai.SetViews(sql.NullInt64{Int64: int64(viewsint), Valid: true}) - ai.SetIsShowing(1) + clog.SetViews(sql.NullInt64{Int64: int64(viewsint), Valid: true}) + clog.SetIsShowing(1) } var givers []interface{} @@ -167,50 +177,69 @@ func TestExtractor(t *testing.T) { giversbytes, err := json.Marshal(givers) if err != nil { t.Error(err) - ai.SetErrorMsg(sql.NullString{String: err.Error(), Valid: true}) + clog.SetErrorMsg(sql.NullString{String: err.Error(), Valid: true}) } else { - ai.SetGiver(giversbytes) + clog.SetGiver(giversbytes) } // MovieToolbar__Views-g5e6ic-13 iDRGyA livejson := m["user_live"] - f, err := os.OpenFile("./test.html", os.O_CREATE|os.O_TRUNC|os.O_RDWR, os.ModePerm) - if err != nil { - panic(err) - } - f.WriteString(livejson.String()) - t.Error(livejson) - extractor = hunter.NewExtractor([]byte(livejson.Str)) - xr, err := extractor.XPathResult("//h1[ contains(@class, 'MovieTitle__Title')]") + // f, err := os.OpenFile("./test.html", os.O_CREATE|os.O_TRUNC|os.O_RDWR, os.ModePerm) + // if err != nil { + // panic(err) + // } + // f.WriteString(livejson.String()) - if err != nil { - t.Error(err) - } - t.Error(xr) - iter := xr.NodeIter() - if iter.Next() { - t.Error(iter.Node().TextContent()) - ai.SetShowTitle(sql.NullString{String: iter.Node().TextContent(), Valid: true}) + extractor = hunter.NewExtractor([]byte(livejson.Str)) + // xr, err := extractor.XPathResult("//h1[ contains(@class, 'MovieTitle__Title')]") + // if err != nil { + // t.Error(err) + // } + + mathes := regexp.MustCompile("MovieTitle__Title.*>(.+)").FindStringSubmatch(livejson.Str) + if len(mathes) == 2 { + + clog.SetShowTitle(sql.NullString{String: mathes[1], Valid: true}) content, err := extractor.XPathResult("//meta[@itemprop='uploadDate']/@content") if err != nil { t.Error(err) } - if content.NodeIter().Next() { - t.Error(content.String()) + iter := content.NodeIter() + if iter.Next() { + tm, err := time.ParseInLocation("2006-01-02T15:04:05Z07:00", iter.Node().NodeValue(), time.Local) + if err != nil { + t.Error(err) + } + clog.SetShowStartTime(sql.NullTime{Time: tm.Local(), Valid: true}) + + duration, err := extractor.XPathResult("//meta[@itemprop='duration']/@content") + if err != nil { + t.Error(err) + } + + diter := duration.NodeIter() + if diter.Next() { + + dt, err := intimate.ParseDuration(diter.Node().NodeValue()) + if err != nil { + log.Println(err) + } + endtm := tm.Add(dt) + clog.SetShowEndTime(sql.NullTime{Time: endtm.Local(), Valid: true}) + } } } - t.Error(xr.String(), xr.NodeIter().Next(), xr.String()) - ai.SetGratuity(sql.NullInt64{Int64: gratuity, Valid: true}) - ai.SetPlatform(string(intimate.Popenrec)) - ai.SetFollowers(sql.NullInt64{Int64: int64(followersInt), Valid: true}) - ai.SetAnchorId(source.GetSource().String) - ai.SetUpdateTime(source.GetUpdateTime()) + clog.SetGratuity(sql.NullInt64{Int64: gratuity, Valid: true}) + clog.SetPlatform(string(intimate.Popenrec)) + clog.SetFollowers(sql.NullInt64{Int64: int64(followersInt), Valid: true}) + clog.SetAnchorId(anchorId) + clog.SetUpdateTime(source.GetUpdateTime()) - collect.InsertCollectLog(ai) + collect.InsertCollectLog(clog) } else { t.Error("data is not json:\n", string(sdata)) diff --git a/go.mod b/go.mod index 6575876..690761f 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ require ( github.com/474420502/hunter v0.3.0 github.com/474420502/requests v1.6.0 github.com/go-sql-driver/mysql v1.5.0 + github.com/lestrrat-go/libxml2 v0.0.0-20200215080510-6483566f52cb github.com/tidwall/gjson v1.6.0 github.com/tidwall/pretty v1.0.1 // indirect golang.org/x/net v0.0.0-20200707034311-ab3426394381 // indirect diff --git a/sql/intimate_extractor.sql b/sql/intimate_extractor.sql index cb1d1e5..67a2d03 100644 --- a/sql/intimate_extractor.sql +++ b/sql/intimate_extractor.sql @@ -10,7 +10,7 @@ CREATE TABLE IF NOT EXISTS `anchor_info` ( `channel` varchar(128) DEFAULT NULL, `show_type` varchar(255) DEFAULT NULL, `ext` json DEFAULT NULL, - `update_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, + `update_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, PRIMARY KEY (`uid`), UNIQUE KEY `platform_anchor_id_idx` (`platform`, `anchor_id`), KEY `platform_idx` (`platform`), @@ -35,9 +35,9 @@ CREATE TABLE IF NOT EXISTS `collect_log` ( `gratuity` bigint(11) DEFAULT NULL, `show_title` text DEFAULT NULL, - `show_start_time` timestamp NULL DEFAULT NULL, - `show_end_time` timestamp NULL DEFAULT NULL, - `update_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, + `show_start_time` timestamp NULL DEFAULT NULL, + `show_end_time` timestamp NULL DEFAULT NULL, + `update_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, `ext` json DEFAULT NULL, `error_msg` text DEFAULT NULL, diff --git a/sql/intimate_source.sql b/sql/intimate_source.sql index fac4d51..8c489a3 100644 --- a/sql/intimate_source.sql +++ b/sql/intimate_source.sql @@ -8,7 +8,7 @@ CREATE TABLE IF NOT EXISTS `source_openrec` ( `source` longtext DEFAULT NULL, `ext` json DEFAULT NULL, `pass_gob` blob DEFAULT NULL, - `update_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, + `update_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, `operator` int DEFAULT 0, `error_msg` text DEFAULT NULL, PRIMARY KEY(`uid`), diff --git a/store.go b/store.go index 4810691..e99ecc9 100644 --- a/store.go +++ b/store.go @@ -181,9 +181,10 @@ func NewExtractorStore() *ExtractorStore { */ // InsertAnchorInfo AnchorInfo表, 插入数据 -func (store *ExtractorStore) InsertAnchorInfo(isource IGetAnchorInfo) { +func (store *ExtractorStore) InsertAnchorInfo(isource IGetAnchorInfo) error { _, err := store.db.Exec("insert into "+AnchorTable+"(platform, anchor_id, anchor_name, live_url, channel, show_type, ext) values(?,?,?,?,?,?,?) ON DUPLICATE KEY UPDATE", isource.GetPlatform(), isource.GetAnchorId(), isource.GetAnchorName(), isource.GetLiveUrl(), isource.GetChannel(), isource.GetShowType(), isource.GetExt()) store.errorAlarm(err) + return err } /* @@ -209,9 +210,10 @@ func (store *ExtractorStore) InsertAnchorInfo(isource IGetAnchorInfo) { */ // InsertCollectLog CollectLog表插入数据 -func (store *ExtractorStore) InsertCollectLog(isource IGetCollectLog) { +func (store *ExtractorStore) InsertCollectLog(isource IGetCollectLog) error { _, err := store.db.Exec("insert into "+CollectLogTable+"(uid, platform, anchor_id, is_showing, is_error, followers, views, giver, gratuity, show_title, show_start_time, show_end_time, update_time, ext, error_msg) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)", isource.GetUid(), isource.GetPlatform(), isource.GetAnchorId(), isource.GetIsShowing(), isource.GetIsError(), isource.GetFollowers(), isource.GetViews(), isource.GetGiver(), isource.GetGratuity(), isource.GetShowTitle(), isource.GetShowStartTime(), isource.GetShowEndTime(), isource.GetUpdateTime(), isource.GetExt(), isource.GetErrorMsg(), ) store.errorAlarm(err) + return err } diff --git a/tasks/openrec/openrec_task2/task_openrec.go b/tasks/openrec/openrec_task2/task_openrec.go index 6610541..286914c 100644 --- a/tasks/openrec/openrec_task2/task_openrec.go +++ b/tasks/openrec/openrec_task2/task_openrec.go @@ -35,7 +35,7 @@ func (oer *OpenrecExtratorRanking) Execute(cxt *hunter.TaskContext) { source, err := store.Pop(string(intimate.TTOpenrecUser)) - if source == nil && err != nil { + if source == nil || err != nil { log.Println(err) time.Sleep(time.Second * 2) continue diff --git a/utils.go b/utils.go new file mode 100644 index 0000000..f922bf5 --- /dev/null +++ b/utils.go @@ -0,0 +1,28 @@ +package intimate + +import ( + "log" + "time" +) + +var zeroTime time.Time + +func init() { + + tm, err := time.Parse("15:04:05", "0:00:00") + if err != nil { + log.Println(err) + } + zeroTime = tm + +} + +// ParseDuration time to duration eg: 1:40:00 -> time.Duration +func ParseDuration(dt string) (time.Duration, error) { + tdt, err := time.Parse("15:04:05", dt) + if err != nil { + + return time.Duration(0), err + } + return tdt.Sub(zeroTime), nil +}