完成 Twitch 数据提取入库

This commit is contained in:
eson
2020-07-28 18:56:27 +08:00
parent 1d2f2d14c5
commit ac1ab81676
4 changed files with 212 additions and 65 deletions

View File

@@ -49,22 +49,51 @@ func (cl *ChannelLink) Execute() {
}
e.Click()
var hrefs map[string]bool = make(map[string]bool)
var delayerror = 5
var samecount = 0
for i := 0; i <= 200; i++ {
wd.KeyDown(selenium.EndKey)
time.Sleep(time.Second * 2)
}
elements, err := wd.FindElements(selenium.ByXPATH, "//span/a[contains(@data-a-target,'card-') and @href]")
if err != nil {
panic(err)
}
// xpath: //article//a[@data-a-target='preview-card-title-link']
for _, ele := range elements {
href, err := ele.GetAttribute("href")
cards, err := wd.FindElements(selenium.ByXPATH, "//span/a[contains(@data-a-target,'card-') and @href]")
if err != nil {
log.Println(err)
break
}
log.Println(href) // TODO: Save href
if len(cards) == samecount {
delayerror--
if delayerror <= 0 {
break
}
} else {
delayerror = 5
}
for _, card := range cards {
href, err := card.GetAttribute("href")
if err != nil {
log.Println(err)
} else {
hrefs[href] = true
}
}
samecount = len(cards)
if len(cards) > 10 {
log.Println(len(cards))
wd.ExecuteScript(`items = document.evaluate("//div[@data-target='directory-page__card-container']/../self::div[@data-target and @style]", document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
for (var i = 0; i < items.snapshotLength - 10; i++) { item = items.snapshotItem(i); item.remove() ;};`, nil)
}
time.Sleep(time.Millisecond * 200)
wd.KeyDown(selenium.EndKey)
time.Sleep(time.Millisecond * 200)
wd.KeyUp(selenium.EndKey)
time.Sleep(time.Millisecond * 2500)
}
for href := range hrefs {
// TODO: Save href
source := &intimate.Source{}
source.Source = sql.NullString{String: href, Valid: true}
source.Operator = 0
@@ -73,5 +102,6 @@ func (cl *ChannelLink) Execute() {
sstore.Insert(source)
}
log.Println("hrefs len:", len(hrefs))
sstore.Deduplicate(intimate.TTwitchChannel, "source")
}