From 7e3b36c7d0366365af4fed62d815a17fde5fec91 Mon Sep 17 00:00:00 2001 From: eson Date: Thu, 16 Jul 2020 15:25:55 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AE=8C=E6=88=90=E8=A7=A3=E6=9E=90=E6=A8=A1?= =?UTF-8?q?=E5=9D=97=E7=A4=BA=E4=BE=8B.=20=E6=95=B0=E6=8D=AE=E5=BA=93?= =?UTF-8?q?=E5=85=A5=E5=BA=93=E6=B5=8B=E8=AF=95.=20TODO:=20=E8=B0=83?= =?UTF-8?q?=E6=95=B4=E7=A8=8B=E5=BA=8F=E5=90=AF=E5=8A=A8=E5=81=9C=E6=AD=A2?= =?UTF-8?q?(=E9=9D=9E=E6=9A=B4=E5=8A=9B=E5=85=B3=E9=97=AD).?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extractor/openrec/openrec_extractor.go | 215 +++++++++++++++++++++++ extractor/openrec/openrec_test.go | 226 ++++++------------------- extractor_field.go | 189 ++++++++++----------- go.mod | 1 - go.sum | 4 - sql/intimate_extractor.sql | 10 +- store.go | 38 +---- 7 files changed, 361 insertions(+), 322 deletions(-) create mode 100644 extractor/openrec/openrec_extractor.go diff --git a/extractor/openrec/openrec_extractor.go b/extractor/openrec/openrec_extractor.go new file mode 100644 index 0000000..7e846ba --- /dev/null +++ b/extractor/openrec/openrec_extractor.go @@ -0,0 +1,215 @@ +package main + +import ( + "database/sql" + "encoding/json" + "intimate" + "log" + "regexp" + "strconv" + "strings" + "time" + + "github.com/tidwall/gjson" +) + +// OpenrecExtractor 提取方法 +type OpenrecExtractor struct { + user *intimate.ExtractorSource + userLive *intimate.ExtractorSource + supporters *intimate.ExtractorSource +} + +func (oe *OpenrecExtractor) extractFollowers(clog intimate.ISet) { + extractor := oe.user.GetExtractor() + xp, err := extractor.XPathResult("//p[@class='c-global__user__count__row__right js-userCountFollowers']/text()") + if err != nil { + log.Println(err) + } + if !xp.NodeIter().Next() { + log.Println("不存在粉丝数") + } + + followers := strings.ReplaceAll(xp.String(), ",", "") + followersInt, err := strconv.ParseInt(followers, 10, 64) + if err != nil { + log.Println(err) + } + + clog.Set("Followers", sql.NullInt64{Int64: followersInt, Valid: true}) +} + +func (oe *OpenrecExtractor) extractAnchorName(ai intimate.ISet) { + extractor := oe.user.GetExtractor() + xp, err := extractor.XPathResult("//p[@class='c-global__user__profile__list__name__text official-icon--after']/text()") + if xp.NodeIter().Next() { + anchorName := xp.String() + ai.Set("AnchorName", anchorName) + } else { + log.Println(err) + } +} + +func (oe *OpenrecExtractor) extractViewsAndLiveStreaming(clog intimate.ISet) { + extractor := oe.user.GetExtractor() + // c-contents + xp, err := extractor.XPathResult("//ul[@class='c-contents']//p[@class='c-thumbnailVideo__footer__liveCount']/text()") + if err != nil { + log.Println(err) + } + if xp.NodeIter().Next() { + views := regexp.MustCompile(`[0-9,]+`).FindString(xp.String()) + views = strings.ReplaceAll(views, ",", "") + viewsint, err := strconv.Atoi(views) + if err != nil { + log.Println(err) + } + + clog.Set("Views", sql.NullInt64{Int64: int64(viewsint), Valid: true}) + clog.Set("IsLiveStreaming", int32(1)) + } +} + +func (oe *OpenrecExtractor) extractGiversAndGratuity(clog intimate.ISet) { + // extractor := oe.user.GetExtractor() + giverjson := oe.supporters.GetSource() + var givers []interface{} + var gratuity int64 = 0 + + for _, v := range giverjson.Array() { + giverSource := gjson.Parse(v.String()) + for _, item := range giverSource.Get("data.items").Array() { + givers = append(givers, item.Map()) + gratuity += item.Get("total_yells").Int() + } + } + + giversbytes, err := json.Marshal(givers) + if err != nil { + log.Println(err) + clog.Set("ErrorMsg", sql.NullString{String: err.Error(), Valid: true}) + } else { + clog.Set("Giver", giversbytes) + } + + clog.Set("Gratuity", sql.NullInt64{Int64: gratuity, Valid: true}) +} + +func (oe *OpenrecExtractor) extractLive(clog intimate.ISet) { + extractor := oe.userLive.GetExtractor() + mathes := regexp.MustCompile("MovieTitle__Title[^>]+>(.{1,50})").FindStringSubmatch(oe.userLive.GetSource().Str) + if len(mathes) == 2 { + + clog.Set("LiveTitle", sql.NullString{String: mathes[1], Valid: true}) + + content, err := extractor.XPathResult("//meta[@itemprop='uploadDate']/@content") + if err != nil { + log.Println(err) + } + + iter := content.NodeIter() + if iter.Next() { + tm, err := time.ParseInLocation("2006-01-02T15:04:05Z07:00", iter.Node().NodeValue(), time.Local) + if err != nil { + log.Println(err) + } + clog.Set("LiveStartTime", sql.NullTime{Time: tm.Local(), Valid: true}) + + duration, err := extractor.XPathResult("//meta[@itemprop='duration']/@content") + if err != nil { + log.Println(err) + } + + diter := duration.NodeIter() + if diter.Next() { + + dt, err := intimate.ParseDuration(diter.Node().NodeValue()) + if err != nil { + log.Println(err) + } + endtm := tm.Add(dt) + clog.Set("LiveEndTime", sql.NullTime{Time: endtm.Local(), Valid: true}) + } + } + } +} + +func (oe *OpenrecExtractor) extractTags(clog intimate.ISet) { + var tags []string + matheslist := regexp.MustCompile(`TagButton__Button[^>]+>(.{1,100})]+>(.{1,50})").FindStringSubmatch(livejson.Str) - if len(mathes) == 2 { - - clog.SetShowTitle(sql.NullString{String: mathes[1], Valid: true}) - - content, err := extractor.XPathResult("//meta[@itemprop='uploadDate']/@content") - if err != nil { - t.Error(err) - } - - iter := content.NodeIter() - if iter.Next() { - tm, err := time.ParseInLocation("2006-01-02T15:04:05Z07:00", iter.Node().NodeValue(), time.Local) - if err != nil { - t.Error(err) - } - clog.SetShowStartTime(sql.NullTime{Time: tm.Local(), Valid: true}) - - duration, err := extractor.XPathResult("//meta[@itemprop='duration']/@content") - if err != nil { - t.Error(err) - } - - diter := duration.NodeIter() - if diter.Next() { - - dt, err := intimate.ParseDuration(diter.Node().NodeValue()) - if err != nil { - log.Println(err) - } - endtm := tm.Add(dt) - clog.SetShowEndTime(sql.NullTime{Time: endtm.Local(), Valid: true}) - } - } - } - - var tags []string - matheslist := regexp.MustCompile(`TagButton__Button[^>]+>(.{1,100})