feat(backend): add Baidu Gushitong stock news crawling

- Add SearchGuShiTongStockInfo to crawl stock news from Baidu Gushitong
- Update the OpenAI_API function to also crawl Gushitong news
- Add RemoveAllNonDigitChar to strip all non-digit characters
spark 2025-02-13 17:56:22 +08:00
parent fc37440f6b
commit f4da21d645
3 changed files with 194 additions and 16 deletions
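
Taken together, the new code normalizes a stock code down to its digits and crawls the matching Gushitong page with chromedp. A minimal sketch of the URL construction (the "sh600745" input is illustrative):

	// Illustrative input: RemoveAllNonDigitChar strips the exchange prefix,
	// leaving the bare digits that Gushitong's "ab-" URLs expect.
	url := "https://gushitong.baidu.com/stock/ab-" + RemoveAllNonDigitChar("sh600745")
	// url == "https://gushitong.baidu.com/stock/ab-600745"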


@@ -0,0 +1,115 @@
package data
import (
"context"
"github.com/PuerkitoBio/goquery"
"github.com/duke-git/lancet/v2/strutil"
"go-stock/backend/logger"
"strings"
"testing"
"time"
"github.com/chromedp/chromedp"
"github.com/stretchr/testify/assert"
)
func TestNewTimeOutGuShiTongCrawler(t *testing.T) {
crawlerAPI := CrawlerApi{}
timeout := 10
crawlerBaseInfo := CrawlerBaseInfo{
Name: "TestCrawler",
Description: "Test Crawler Description",
BaseUrl: "https://gushitong.baidu.com",
Headers: map[string]string{"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0"},
}
result := crawlerAPI.NewTimeOutCrawler(timeout, crawlerBaseInfo)
assert.NotNil(t, result.crawlerCtx)
assert.Equal(t, crawlerBaseInfo, result.crawlerBaseInfo)
}
func TestNewGuShiTongCrawler(t *testing.T) {
crawlerAPI := CrawlerApi{}
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
crawlerBaseInfo := CrawlerBaseInfo{
Name: "TestCrawler",
Description: "Test Crawler Description",
BaseUrl: "https://gushitong.baidu.com",
Headers: map[string]string{"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0"},
}
result := crawlerAPI.NewCrawler(ctx, crawlerBaseInfo)
assert.Equal(t, ctx, result.crawlerCtx)
assert.Equal(t, crawlerBaseInfo, result.crawlerBaseInfo)
}
func TestGetHtml(t *testing.T) {
crawlerAPI := CrawlerApi{}
crawlerBaseInfo := CrawlerBaseInfo{
Name: "TestCrawler",
Description: "Test Crawler Description",
BaseUrl: "https://gushitong.baidu.com",
Headers: map[string]string{"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0"},
}
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
defer cancel()
crawlerAPI = crawlerAPI.NewCrawler(ctx, crawlerBaseInfo)
url := "https://www.cls.cn/searchPage?type=depth&keyword=%E6%96%B0%E5%B8%8C%E6%9C%9B"
waitVisible := ".search-telegraph-list,.subject-interest-list"
//url = "https://gushitong.baidu.com/stock/ab-600745"
//waitVisible = "div.news-item"
htmlContent, success := crawlerAPI.GetHtml(url, waitVisible, true)
if success {
document, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
if err != nil {
logger.SugaredLogger.Error(err.Error())
}
var messages []string
document.Find(waitVisible).Each(func(i int, selection *goquery.Selection) {
text := strutil.RemoveNonPrintable(selection.Text())
messages = append(messages, text)
logger.SugaredLogger.Infof("搜索到消息: %s", text)
})
}
//logger.SugaredLogger.Infof("htmlContent:%s", htmlContent)
}
func TestGetHtmlWithActions(t *testing.T) {
crawlerAPI := CrawlerApi{}
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
defer cancel()
crawlerAPI = crawlerAPI.NewCrawler(ctx, CrawlerBaseInfo{
Name: "百度股市通",
Description: "Test Crawler Description",
BaseUrl: "https://gushitong.baidu.com",
Headers: map[string]string{"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0"},
})
actions := []chromedp.Action{
chromedp.Navigate("https://gushitong.baidu.com/stock/ab-600745"),
chromedp.WaitVisible("div.cos-tab"),
chromedp.Click("div.cos-tab:nth-child(5)", chromedp.ByQuery),
chromedp.ScrollIntoView("div.body-box"),
chromedp.WaitVisible("div.body-col"),
chromedp.Evaluate(`window.scrollTo(0, document.body.scrollHeight);`, nil),
chromedp.Sleep(1 * time.Second),
}
htmlContent, success := crawlerAPI.GetHtmlWithActions(&actions, true)
if success {
document, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
if err != nil {
logger.SugaredLogger.Error(err.Error())
}
var messages []string
document.Find("div.finance-hover,div.list-date").Each(func(i int, selection *goquery.Selection) {
text := strutil.RemoveNonPrintable(selection.Text())
messages = append(messages, text)
logger.SugaredLogger.Infof("搜索到消息: %s", text)
})
logger.SugaredLogger.Infof("messages:%d", len(messages))
}
//logger.SugaredLogger.Infof("htmlContent:%s", htmlContent)
}
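
The CrawlerApi and CrawlerBaseInfo definitions are outside this diff; from the usage in these tests, a hypothetical reconstruction of the shape they rely on (field and method sets are inferred, and the real types may carry more):

	type CrawlerBaseInfo struct {
		Name        string
		Description string
		BaseUrl     string
		Headers     map[string]string
	}

	type CrawlerApi struct {
		crawlerCtx      context.Context // asserted via result.crawlerCtx above
		crawlerBaseInfo CrawlerBaseInfo // asserted via result.crawlerBaseInfo above
	}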


@@ -191,22 +191,22 @@ func (o OpenAi) NewChatStream(stock, stockCode string) <-chan string {
}
}()
go func() {
defer wg.Done()
messages := SearchStockInfo(stock, "depth", o.CrawlTimeOut)
if messages == nil || len(*messages) == 0 {
logger.SugaredLogger.Error("获取股票资讯失败")
//ch <- "***❗获取股票资讯失败,分析结果可能不准确***<hr>"
//go runtime.EventsEmit(o.ctx, "warnMsg", "❗获取股票资讯失败,分析结果可能不准确")
return
}
for _, message := range *messages {
msg = append(msg, map[string]interface{}{
"role": "assistant",
"content": message,
})
}
}()
//go func() {
// defer wg.Done()
// messages := SearchStockInfo(stock, "depth", o.CrawlTimeOut)
// if messages == nil || len(*messages) == 0 {
// logger.SugaredLogger.Error("获取股票资讯失败")
// //ch <- "***❗获取股票资讯失败,分析结果可能不准确***<hr>"
// //go runtime.EventsEmit(o.ctx, "warnMsg", "❗获取股票资讯失败,分析结果可能不准确")
// return
// }
// for _, message := range *messages {
// msg = append(msg, map[string]interface{}{
// "role": "assistant",
// "content": message,
// })
// }
//}()
go func() {
defer wg.Done()
messages := SearchStockInfo(stock, "telegram", o.CrawlTimeOut)
@@ -223,6 +223,24 @@ func (o OpenAi) NewChatStream(stock, stockCode string) <-chan string {
})
}
}()
go func() {
defer wg.Done()
messages := SearchGuShiTongStockInfo(stockCode, o.CrawlTimeOut)
if messages == nil || len(*messages) == 0 {
logger.SugaredLogger.Error("获取股市通资讯失败")
//ch <- "***❗获取股市通资讯失败,分析结果可能不准确***<hr>"
//go runtime.EventsEmit(o.ctx, "warnMsg", "❗获取股市通资讯失败,分析结果可能不准确")
return
}
for _, message := range *messages {
msg = append(msg, map[string]interface{}{
"role": "assistant",
"content": message,
})
}
}()
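// Note: these goroutines all append to the shared msg slice concurrently;
// under the Go memory model that is a data race, so a sync.Mutex (or a
// channel fan-in) around the appends would be the safe pattern here.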
wg.Wait()
msg = append(msg, map[string]interface{}{
"role": "user",
@@ -308,6 +326,45 @@ func (o OpenAi) NewChatStream(stock, stockCode string) <-chan string {
return ch
}
func SearchGuShiTongStockInfo(stock string, crawlTimeOut int64) *[]string {
crawlerAPI := CrawlerApi{}
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(crawlTimeOut)*time.Second)
defer cancel()
crawlerAPI = crawlerAPI.NewCrawler(ctx, CrawlerBaseInfo{
Name: "百度股市通",
BaseUrl: "https://gushitong.baidu.com",
Headers: map[string]string{"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0"},
})
url := "https://gushitong.baidu.com/stock/ab-" + RemoveAllNonDigitChar(stock)
logger.SugaredLogger.Infof("SearchGuShiTongStockInfo搜索股票-%s: %s", stock, url)
actions := []chromedp.Action{
chromedp.Navigate(url),
chromedp.WaitVisible("div.cos-tab"),
chromedp.Click("div.cos-tab:nth-child(5)", chromedp.ByQuery),
chromedp.ScrollIntoView("div.body-box"),
chromedp.WaitVisible("div.body-col"),
chromedp.Evaluate(`window.scrollTo(0, document.body.scrollHeight);`, nil),
chromedp.Sleep(1 * time.Second),
}
htmlContent, success := crawlerAPI.GetHtmlWithActions(&actions, true)
var messages []string
if success {
document, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
if err != nil {
logger.SugaredLogger.Error(err.Error())
return &[]string{}
}
document.Find("div.finance-hover,div.list-date").Each(func(i int, selection *goquery.Selection) {
text := strutil.RemoveNonPrintable(selection.Text())
messages = append(messages, text)
logger.SugaredLogger.Infof("SearchGuShiTongStockInfo搜索到消息: %s", text)
})
logger.SugaredLogger.Infof("messages:%d", len(messages))
}
return &messages
}
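// Usage note: SearchGuShiTongStockInfo never returns nil; on failure it
// returns a pointer to an empty slice, so callers only need a length check.
// A hypothetical call site (stock code and timeout are illustrative):
//
//	if messages := SearchGuShiTongStockInfo("600745", 60); len(*messages) > 0 {
//		logger.SugaredLogger.Infof("got %d Gushitong items", len(*messages))
//	}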
func GetFinancialReports(stockCode string, crawlTimeOut int64) *[]string {
// 创建一个 chromedp 上下文
timeoutCtx, timeoutCtxCancel := context.WithTimeout(context.Background(), time.Duration(crawlTimeOut)*time.Second)


@@ -15,3 +15,9 @@ func removeAllSpaces(s string) string {
re := regexp.MustCompile(`\s`)
return re.ReplaceAllString(s, "")
}
// RemoveAllNonDigitChar removes all non-digit characters from s
func RemoveAllNonDigitChar(s string) string {
re := regexp.MustCompile(`\D`)
return re.ReplaceAllString(s, "")
}
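
A quick check of the helper's behavior (a hypothetical test, not part of this commit; testify is already used by the crawler tests):

	func TestRemoveAllNonDigitChar(t *testing.T) {
		// Prefixes, separators, and letters are stripped; digits are kept.
		assert.Equal(t, "600745", RemoveAllNonDigitChar("sh600745"))
		assert.Equal(t, "600745", RemoveAllNonDigitChar("ab-600745"))
		assert.Equal(t, "", RemoveAllNonDigitChar("abc"))
	}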