From f4da21d64526636666400cdbc1d28a06e8450fd6 Mon Sep 17 00:00:00 2001
From: spark
Date: Thu, 13 Feb 2025 17:56:22 +0800
Subject: [PATCH] =?UTF-8?q?feat(backend):=E6=B7=BB=E5=8A=A0=E8=82=A1?=
 =?UTF-8?q?=E5=B8=82=E9=80=9A=E8=B5=84=E8=AE=AF=E7=88=AC=E5=8F=96=E5=8A=9F?=
 =?UTF-8?q?=E8=83=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 新增 SearchGuShiTongStockInfo函数,用于爬取百度股市通的股票资讯
- 修改 OpenAI_API 函数,增加股市通资讯的爬取
- 添加 RemoveAllNonDigitChar 函数,用于去除所有非数字字符
---
Review notes (placed between "---" and the diffstat, so git am drops them):

- This revision fixes the "股势通" typo in the new log message (the crawler is
  named "股市通" everywhere else), compiles the \D pattern once at package
  scope, returns early when the stock code holds no digits, makes the crawler
  tests fail instead of using a possibly nil document after a parse error,
  and adds English doc comments. Hunk sizes and the diffstat are updated to
  match; the post-image blob ids on the "index" lines were not recomputed.
- Line breaks and indentation were rebuilt following gofmt conventions. If
  context lines do not match the target tree byte for byte, apply with
  `git apply --ignore-whitespace`.
- NOTE(review): the added goroutine appends to the captured msg slice, as the
  goroutine it replaces did. If sibling goroutines do the same concurrently
  this is a data race; confirm and guard msg with a mutex in NewChatStream.
- NOTE(review): one wg.Done() caller is commented out and one is added, so
  the wg.Add count (not visible in this patch) is assumed to stay correct.

 backend/data/crawler_api_test.go | 125 +++++++++++++++++++++++++++++++
 backend/data/openai_api.go       | 106 +++++++++++++++++++++-----
 backend/data/utils.go            |   9 +++
 3 files changed, 224 insertions(+), 16 deletions(-)
 create mode 100644 backend/data/crawler_api_test.go

diff --git a/backend/data/crawler_api_test.go b/backend/data/crawler_api_test.go
new file mode 100644
index 0000000..2501552
--- /dev/null
+++ b/backend/data/crawler_api_test.go
@@ -0,0 +1,125 @@
+package data
+
+import (
+	"context"
+	"github.com/PuerkitoBio/goquery"
+	"github.com/duke-git/lancet/v2/strutil"
+	"go-stock/backend/logger"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/chromedp/chromedp"
+	"github.com/stretchr/testify/assert"
+)
+
+// TestNewTimeOutGuShiTongCrawler checks that NewTimeOutCrawler returns a
+// crawler with a non-nil context and the CrawlerBaseInfo it was given.
+func TestNewTimeOutGuShiTongCrawler(t *testing.T) {
+	crawlerAPI := CrawlerApi{}
+	timeout := 10
+	crawlerBaseInfo := CrawlerBaseInfo{
+		Name:        "TestCrawler",
+		Description: "Test Crawler Description",
+		BaseUrl:     "https://gushitong.baidu.com",
+		Headers:     map[string]string{"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0"},
+	}
+
+	result := crawlerAPI.NewTimeOutCrawler(timeout, crawlerBaseInfo)
+	assert.NotNil(t, result.crawlerCtx)
+	assert.Equal(t, crawlerBaseInfo, result.crawlerBaseInfo)
+}
+
+// TestNewGuShiTongCrawler checks that NewCrawler stores the supplied context
+// and CrawlerBaseInfo unchanged.
+func TestNewGuShiTongCrawler(t *testing.T) {
+	crawlerAPI := CrawlerApi{}
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+	crawlerBaseInfo := CrawlerBaseInfo{
+		Name:        "TestCrawler",
+		Description: "Test Crawler Description",
+		BaseUrl:     "https://gushitong.baidu.com",
+		Headers:     map[string]string{"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0"},
+	}
+
+	result := crawlerAPI.NewCrawler(ctx, crawlerBaseInfo)
+	assert.Equal(t, ctx, result.crawlerCtx)
+	assert.Equal(t, crawlerBaseInfo, result.crawlerBaseInfo)
+}
+
+// TestGetHtml fetches a cls.cn search page through GetHtml and logs the text
+// of every element matching the wait selector. It targets a live site and
+// makes no assertions on the page content.
+func TestGetHtml(t *testing.T) {
+	crawlerAPI := CrawlerApi{}
+	crawlerBaseInfo := CrawlerBaseInfo{
+		Name:        "TestCrawler",
+		Description: "Test Crawler Description",
+		BaseUrl:     "https://gushitong.baidu.com",
+		Headers:     map[string]string{"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0"},
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
+	defer cancel()
+	crawlerAPI = crawlerAPI.NewCrawler(ctx, crawlerBaseInfo)
+
+	url := "https://www.cls.cn/searchPage?type=depth&keyword=%E6%96%B0%E5%B8%8C%E6%9C%9B"
+	waitVisible := ".search-telegraph-list,.subject-interest-list"
+
+	//url = "https://gushitong.baidu.com/stock/ab-600745"
+	//waitVisible = "div.news-item"
+	htmlContent, success := crawlerAPI.GetHtml(url, waitVisible, true)
+	if success {
+		document, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
+		if err != nil {
+			t.Fatalf("parsing crawled html: %v", err)
+		}
+		var messages []string
+		document.Find(waitVisible).Each(func(i int, selection *goquery.Selection) {
+			text := strutil.RemoveNonPrintable(selection.Text())
+			messages = append(messages, text)
+			logger.SugaredLogger.Infof("搜索到消息-%s: %s", "", text)
+		})
+	}
+	//logger.SugaredLogger.Infof("htmlContent:%s", htmlContent)
+}
+
+// TestGetHtmlWithActions drives the Baidu Gushitong page for stock 600745
+// with a chromedp action list and logs the entries found in the resulting
+// HTML. It targets a live site and makes no assertions on the page content.
+func TestGetHtmlWithActions(t *testing.T) {
+	crawlerAPI := CrawlerApi{}
+	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
+	defer cancel()
+
+	crawlerAPI = crawlerAPI.NewCrawler(ctx, CrawlerBaseInfo{
+		Name:        "百度股市通",
+		Description: "Test Crawler Description",
+		BaseUrl:     "https://gushitong.baidu.com",
+		Headers:     map[string]string{"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0"},
+	})
+	actions := []chromedp.Action{
+		chromedp.Navigate("https://gushitong.baidu.com/stock/ab-600745"),
+		chromedp.WaitVisible("div.cos-tab"),
+		chromedp.Click("div.cos-tab:nth-child(5)", chromedp.ByQuery),
+		chromedp.ScrollIntoView("div.body-box"),
+		chromedp.WaitVisible("div.body-col"),
+		chromedp.Evaluate(`window.scrollTo(0, document.body.scrollHeight);`, nil),
+		chromedp.Sleep(1 * time.Second),
+	}
+	htmlContent, success := crawlerAPI.GetHtmlWithActions(&actions, true)
+	if success {
+		document, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
+		if err != nil {
+			t.Fatalf("parsing crawled html: %v", err)
+		}
+		var messages []string
+		document.Find("div.finance-hover,div.list-date").Each(func(i int, selection *goquery.Selection) {
+			text := strutil.RemoveNonPrintable(selection.Text())
+			messages = append(messages, text)
+			logger.SugaredLogger.Infof("搜索到消息-%s: %s", "", text)
+		})
+		logger.SugaredLogger.Infof("messages:%d", len(messages))
+	}
+	//logger.SugaredLogger.Infof("htmlContent:%s", htmlContent)
+}
diff --git a/backend/data/openai_api.go b/backend/data/openai_api.go
index a64fde1..cada5e8 100644
--- a/backend/data/openai_api.go
+++ b/backend/data/openai_api.go
@@ -191,22 +191,22 @@ func (o OpenAi) NewChatStream(stock, stockCode string) <-chan string {
 			}
 		}()
 
-		go func() {
-			defer wg.Done()
-			messages := SearchStockInfo(stock, "depth", o.CrawlTimeOut)
-			if messages == nil || len(*messages) == 0 {
-				logger.SugaredLogger.Error("获取股票资讯失败")
-				//ch <- "***❗获取股票资讯失败,分析结果可能不准确***\n"
-				//go runtime.EventsEmit(o.ctx, "warnMsg", "❗获取股票资讯失败,分析结果可能不准确")
-				return
-			}
-			for _, message := range *messages {
-				msg = append(msg, map[string]interface{}{
-					"role":    "assistant",
-					"content": message,
-				})
-			}
-		}()
+		//go func() {
+		//	defer wg.Done()
+		//	messages := SearchStockInfo(stock, "depth", o.CrawlTimeOut)
+		//	if messages == nil || len(*messages) == 0 {
+		//		logger.SugaredLogger.Error("获取股票资讯失败")
+		//		//ch <- "***❗获取股票资讯失败,分析结果可能不准确***\n"
+		//		//go runtime.EventsEmit(o.ctx, "warnMsg", "❗获取股票资讯失败,分析结果可能不准确")
+		//		return
+		//	}
+		//	for _, message := range *messages {
+		//		msg = append(msg, map[string]interface{}{
+		//			"role":    "assistant",
+		//			"content": message,
+		//		})
+		//	}
+		//}()
 		go func() {
 			defer wg.Done()
 			messages := SearchStockInfo(stock, "telegram", o.CrawlTimeOut)
@@ -223,6 +223,24 @@ func (o OpenAi) NewChatStream(stock, stockCode string) <-chan string {
 				})
 			}
 		}()
+
+		go func() {
+			defer wg.Done()
+			messages := SearchGuShiTongStockInfo(stockCode, o.CrawlTimeOut)
+			if messages == nil || len(*messages) == 0 {
+				logger.SugaredLogger.Error("获取股市通资讯失败")
+				//ch <- "***❗获取股市通资讯失败,分析结果可能不准确***\n"
+				//go runtime.EventsEmit(o.ctx, "warnMsg", "❗获取股市通资讯失败,分析结果可能不准确")
+				return
+			}
+			for _, message := range *messages {
+				msg = append(msg, map[string]interface{}{
+					"role":    "assistant",
+					"content": message,
+				})
+			}
+		}()
+
 		wg.Wait()
 		msg = append(msg, map[string]interface{}{
 			"role":    "user",
@@ -308,6 +326,62 @@ func (o OpenAi) NewChatStream(stock, stockCode string) <-chan string {
 	return ch
 }
 
+// SearchGuShiTongStockInfo crawls the Baidu Gushitong page of a stock and
+// returns the text of every "div.finance-hover" and "div.list-date" element
+// found after its fifth tab has been clicked and the page scrolled down.
+//
+// Every non-digit character is stripped from stock before it is placed in the
+// "/stock/ab-<digits>" page URL. crawlTimeOut is the context timeout, in
+// seconds, handed to the crawler. The returned pointer is never nil; it refers
+// to an empty slice when stock holds no digits or the crawl or parse fails.
+func SearchGuShiTongStockInfo(stock string, crawlTimeOut int64) *[]string {
+	code := RemoveAllNonDigitChar(stock)
+	if code == "" {
+		// No digits means there is no page to open, so skip the browser run.
+		logger.SugaredLogger.Errorf("SearchGuShiTongStockInfo: no digits in stock code %q", stock)
+		return &[]string{}
+	}
+
+	crawlerAPI := CrawlerApi{}
+	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(crawlTimeOut)*time.Second)
+	defer cancel()
+
+	crawlerAPI = crawlerAPI.NewCrawler(ctx, CrawlerBaseInfo{
+		Name:    "百度股市通",
+		BaseUrl: "https://gushitong.baidu.com",
+		Headers: map[string]string{"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0"},
+	})
+	url := "https://gushitong.baidu.com/stock/ab-" + code
+	logger.SugaredLogger.Infof("SearchGuShiTongStockInfo搜索股票-%s: %s", stock, url)
+	actions := []chromedp.Action{
+		chromedp.Navigate(url),
+		chromedp.WaitVisible("div.cos-tab"),
+		// NOTE(review): the fifth tab is presumably the news tab — confirm
+		// against the live page.
+		chromedp.Click("div.cos-tab:nth-child(5)", chromedp.ByQuery),
+		chromedp.ScrollIntoView("div.body-box"),
+		chromedp.WaitVisible("div.body-col"),
+		chromedp.Evaluate(`window.scrollTo(0, document.body.scrollHeight);`, nil),
+		chromedp.Sleep(1 * time.Second),
+	}
+	htmlContent, success := crawlerAPI.GetHtmlWithActions(&actions, true)
+	var messages []string
+	if success {
+		document, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
+		if err != nil {
+			logger.SugaredLogger.Error(err.Error())
+			return &[]string{}
+		}
+		document.Find("div.finance-hover,div.list-date").Each(func(i int, selection *goquery.Selection) {
+			text := strutil.RemoveNonPrintable(selection.Text())
+			messages = append(messages, text)
+			logger.SugaredLogger.Infof("SearchGuShiTongStockInfo搜索到消息-%s: %s", "", text)
+		})
+		logger.SugaredLogger.Infof("messages:%d", len(messages))
+	}
+	return &messages
+}
+
 func GetFinancialReports(stockCode string, crawlTimeOut int64) *[]string {
 	// 创建一个 chromedp 上下文
 	timeoutCtx, timeoutCtxCancel := context.WithTimeout(context.Background(), time.Duration(crawlTimeOut)*time.Second)
diff --git a/backend/data/utils.go b/backend/data/utils.go
index eb99c29..ea145b5 100644
--- a/backend/data/utils.go
+++ b/backend/data/utils.go
@@ -15,3 +15,12 @@ func removeAllSpaces(s string) string {
 	re := regexp.MustCompile(`\s`)
 	return re.ReplaceAllString(s, "")
 }
+
+// nonDigitPattern matches any character other than an ASCII digit. It is
+// compiled once at package scope so callers do not recompile it on each call.
+var nonDigitPattern = regexp.MustCompile(`\D`)
+
+// RemoveAllNonDigitChar returns s with every non-digit character removed.
+func RemoveAllNonDigitChar(s string) string {
+	return nonDigitPattern.ReplaceAllString(s, "")
+}