refactor(stock):重构股票价格数据爬取功能

- 移除了不必要的 chromedp Cancel 调用
- 新增了对雪球网的爬虫测试用例
- 修改了股票价格信息的爬取逻辑,使用新浪财经作为数据源
- 优化了爬取结果的 Markdown 格式输出
- 删除了未使用的 validator包引用
This commit is contained in:
ArvinLovegood 2025-03-31 12:33:56 +08:00
parent 34e2de07fb
commit 5f8556cc3d
6 changed files with 94 additions and 88 deletions

View File

@ -68,7 +68,7 @@ func (c *CrawlerApi) GetHtml(url, waitVisible string, headless bool) (string, bo
defer pcancel() defer pcancel()
ctx, cancel := chromedp.NewContext(pctx, chromedp.WithLogf(logger.SugaredLogger.Infof)) ctx, cancel := chromedp.NewContext(pctx, chromedp.WithLogf(logger.SugaredLogger.Infof))
defer cancel() defer cancel()
defer chromedp.Cancel(ctx) //defer chromedp.Cancel(ctx)
err := chromedp.Run(ctx, chromedp.Navigate(url), err := chromedp.Run(ctx, chromedp.Navigate(url),
chromedp.WaitVisible(waitVisible, chromedp.ByQuery), // 确保 元素可见 chromedp.WaitVisible(waitVisible, chromedp.ByQuery), // 确保 元素可见
chromedp.WaitReady(waitVisible, chromedp.ByQuery), // 确保 元素准备好 chromedp.WaitReady(waitVisible, chromedp.ByQuery), // 确保 元素准备好
@ -81,7 +81,7 @@ func (c *CrawlerApi) GetHtml(url, waitVisible string, headless bool) (string, bo
} else { } else {
ctx, cancel := chromedp.NewContext(c.crawlerCtx, chromedp.WithLogf(logger.SugaredLogger.Infof)) ctx, cancel := chromedp.NewContext(c.crawlerCtx, chromedp.WithLogf(logger.SugaredLogger.Infof))
defer cancel() defer cancel()
defer chromedp.Cancel(ctx) //defer chromedp.Cancel(ctx)
err := chromedp.Run(ctx, chromedp.Navigate(url), chromedp.WaitVisible("body"), chromedp.InnerHTML("body", &htmlContent)) err := chromedp.Run(ctx, chromedp.Navigate(url), chromedp.WaitVisible("body"), chromedp.InnerHTML("body", &htmlContent))
if err != nil { if err != nil {
logger.SugaredLogger.Error(err.Error()) logger.SugaredLogger.Error(err.Error())
@ -199,7 +199,7 @@ func (c *CrawlerApi) GetHtmlWithActions(actions *[]chromedp.Action, headless boo
defer pcancel() defer pcancel()
ctx, cancel := chromedp.NewContext(pctx, chromedp.WithLogf(logger.SugaredLogger.Infof)) ctx, cancel := chromedp.NewContext(pctx, chromedp.WithLogf(logger.SugaredLogger.Infof))
defer cancel() defer cancel()
defer chromedp.Cancel(ctx) //defer chromedp.Cancel(ctx)
err := chromedp.Run(ctx, *actions...) err := chromedp.Run(ctx, *actions...)
if err != nil { if err != nil {
@ -209,7 +209,7 @@ func (c *CrawlerApi) GetHtmlWithActions(actions *[]chromedp.Action, headless boo
} else { } else {
ctx, cancel := chromedp.NewContext(c.crawlerCtx, chromedp.WithLogf(logger.SugaredLogger.Infof)) ctx, cancel := chromedp.NewContext(c.crawlerCtx, chromedp.WithLogf(logger.SugaredLogger.Infof))
defer cancel() defer cancel()
defer chromedp.Cancel(ctx) //defer chromedp.Cancel(ctx)
err := chromedp.Run(ctx, *actions...) err := chromedp.Run(ctx, *actions...)
if err != nil { if err != nil {

View File

@ -298,6 +298,39 @@ func TestUSSINA(t *testing.T) {
}) })
} }
// TestXueqiu crawls the Sina Finance quote page for stock sz002906 and
// renders the current price plus the quote-detail table as Markdown.
// NOTE(review): despite the name (Xueqiu / 雪球), the data source here is
// Sina Finance — the crawl target was switched but the test name kept.
func TestXueqiu(t *testing.T) {
	db.Init("../../data/stock.db")
	url := "https://finance.sina.com.cn/realstock/company/sz002906/nc.shtml"
	crawlerAPI := CrawlerApi{}
	crawlerBaseInfo := CrawlerBaseInfo{
		Name:        "TestCrawler",
		Description: "Test Crawler Description",
		BaseUrl:     "https://finance.sina.com.cn",
		Headers:     map[string]string{"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0"},
	}
	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Minute)
	defer cancel()
	crawlerAPI = crawlerAPI.NewCrawler(ctx, crawlerBaseInfo)
	html, ok := crawlerAPI.GetHtml(url, "div#hqDetails table", true)
	if !ok {
		return
	}
	document, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		// Fail fast: the original logged and fell through, then dereferenced
		// a nil document below, which panics the test run.
		t.Fatal(err)
	}
	// Price and quote timestamp shown in the page header.
	price := strutil.RemoveWhiteSpace(document.Find("div#price").First().Text(), false)
	hqTime := strutil.RemoveWhiteSpace(document.Find("div#hqTime").First().Text(), false)

	var markdown strings.Builder
	markdown.WriteString("\n ## 当前股票数据:\n")
	markdown.WriteString(fmt.Sprintf("### 当前股价:%s 时间:%s\n", price, hqTime))
	GetTableMarkdown(document, "div#hqDetails table", &markdown)
}
type Tick struct { type Tick struct {
Code int `json:"code"` Code int `json:"code"`
Status string `json:"status"` Status string `json:"status"`

View File

@ -196,7 +196,7 @@ func (o OpenAi) NewChatStream(stock, stockCode, userQuestion string, sysPromptId
} }
msg = append(msg, map[string]interface{}{ msg = append(msg, map[string]interface{}{
"role": "user", "role": "user",
"content": stock + time.Now().Format(time.DateOnly) + "价格:" + price, "content": "\n## " + stock + "股价数据:\n" + price,
}) })
}() }()

View File

@ -15,7 +15,6 @@ import (
"github.com/duke-git/lancet/v2/convertor" "github.com/duke-git/lancet/v2/convertor"
"github.com/duke-git/lancet/v2/slice" "github.com/duke-git/lancet/v2/slice"
"github.com/duke-git/lancet/v2/strutil" "github.com/duke-git/lancet/v2/strutil"
"github.com/duke-git/lancet/v2/validator"
"github.com/go-resty/resty/v2" "github.com/go-resty/resty/v2"
"go-stock/backend/db" "go-stock/backend/db"
"go-stock/backend/logger" "go-stock/backend/logger"
@ -770,12 +769,12 @@ func GetRealTimeStockPriceInfo(ctx context.Context, stockCode string) (price, pr
func SearchStockPriceInfo(stockCode string, crawlTimeOut int64) *[]string { func SearchStockPriceInfo(stockCode string, crawlTimeOut int64) *[]string {
if strutil.HasPrefixAny(stockCode, []string{"SZ", "SH", "sh", "sz", "bj"}) { if strutil.HasPrefixAny(stockCode, []string{"SZ", "SH", "sh", "sz", "bj"}) {
if strutil.HasPrefixAny(stockCode, []string{"bj", "BJ"}) { //if strutil.HasPrefixAny(stockCode, []string{"bj", "BJ"}) {
stockCode = strutil.ReplaceWithMap(stockCode, map[string]string{ // stockCode = strutil.ReplaceWithMap(stockCode, map[string]string{
"bj": "", // "bj": "",
"BJ": "", // "BJ": "",
}) + ".BJ" // }) + ".BJ"
} //}
return getSHSZStockPriceInfo(stockCode, crawlTimeOut) return getSHSZStockPriceInfo(stockCode, crawlTimeOut)
} }
@ -893,90 +892,36 @@ func getHKStockPriceInfo(stockCode string, crawlTimeOut int64) *[]string {
} }
func getSHSZStockPriceInfo(stockCode string, crawlTimeOut int64) *[]string { func getSHSZStockPriceInfo(stockCode string, crawlTimeOut int64) *[]string {
var messages []string url := "https://finance.sina.com.cn/realstock/company/" + stockCode + "/nc.shtml"
url := "https://www.cls.cn/stock?code=" + stockCode crawlerAPI := CrawlerApi{}
// 创建一个 chromedp 上下文 crawlerBaseInfo := CrawlerBaseInfo{
timeoutCtx, timeoutCtxCancel := context.WithTimeout(context.Background(), time.Duration(crawlTimeOut)*time.Second) Name: "TestCrawler",
defer timeoutCtxCancel() Description: "Test Crawler Description",
var ctx context.Context BaseUrl: "https://finance.sina.com.cn",
var cancel context.CancelFunc Headers: map[string]string{"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0"},
path := getConfig().BrowserPath
logger.SugaredLogger.Infof("SearchStockPriceInfo BrowserPath:%s", path)
if path != "" {
pctx, pcancel := chromedp.NewExecAllocator(
timeoutCtx,
chromedp.ExecPath(path),
chromedp.Flag("headless", true),
)
defer pcancel()
ctx, cancel = chromedp.NewContext(
pctx,
chromedp.WithLogf(logger.SugaredLogger.Infof),
chromedp.WithErrorf(logger.SugaredLogger.Errorf),
)
} else {
ctx, cancel = chromedp.NewContext(
timeoutCtx,
chromedp.WithLogf(logger.SugaredLogger.Infof),
chromedp.WithErrorf(logger.SugaredLogger.Errorf),
)
} }
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(crawlTimeOut)*time.Second)
defer cancel() defer cancel()
crawlerAPI = crawlerAPI.NewCrawler(ctx, crawlerBaseInfo)
var htmlContent string html, ok := crawlerAPI.GetHtml(url, "div#hqDetails table", true)
if !ok {
var tasks chromedp.Tasks return &[]string{""}
tasks = append(tasks, chromedp.Navigate(url)) }
tasks = append(tasks, chromedp.WaitVisible("div.quote-change-box", chromedp.ByQuery)) document, err := goquery.NewDocumentFromReader(strings.NewReader(html))
tasks = append(tasks, chromedp.ActionFunc(func(ctx context.Context) error {
price, _ := FetchPrice(ctx)
logger.SugaredLogger.Infof("price:%s", price)
return nil
}))
tasks = append(tasks, chromedp.OuterHTML("html", &htmlContent, chromedp.ByQuery))
err := chromedp.Run(ctx, tasks)
if err != nil { if err != nil {
logger.SugaredLogger.Error(err.Error()) logger.SugaredLogger.Error(err.Error())
return &[]string{}
}
document, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
if err != nil {
logger.SugaredLogger.Error(err.Error())
return &[]string{}
} }
document.Find("div.quote-text-border,span.quote-price").Each(func(i int, selection *goquery.Selection) { //price
text := strutil.RemoveNonPrintable(selection.Text()) price := strutil.RemoveWhiteSpace(document.Find("div#price").First().Text(), false)
logger.SugaredLogger.Info(text) hqTime := strutil.RemoveWhiteSpace(document.Find("div#hqTime").First().Text(), false)
messages = append(messages, text)
}) var markdown strings.Builder
return &messages markdown.WriteString(fmt.Sprintf("### 当前股价:%s 时间:%s\n", price, hqTime))
GetTableMarkdown(document, "div#hqDetails table", &markdown)
return &[]string{markdown.String()}
} }
// FetchPrice polls the rendered page once per second for the text of
// span.quote-price and returns the first non-empty, numeric value it sees.
// It gives up with an error after ten seconds of unsuccessful attempts.
func FetchPrice(ctx context.Context) (string, error) {
	var text string
	deadline := time.After(10 * time.Second) // overall polling budget
	poll := time.NewTicker(1 * time.Second)  // one scrape attempt per second
	defer poll.Stop()
	for {
		select {
		case <-deadline:
			return "", fmt.Errorf("timeout reached while fetching price")
		case <-poll.C:
			if err := chromedp.Run(ctx, chromedp.Text("span.quote-price", &text, chromedp.BySearch)); err != nil {
				logger.SugaredLogger.Errorf("failed to fetch price: %v", err)
				continue
			}
			logger.SugaredLogger.Infof("price:%s", text)
			// Keep polling until the element holds a real numeric price.
			if text == "" || !validator.IsNumberStr(text) {
				continue
			}
			return text, nil
		}
	}
}
func SearchStockInfo(stock, msgType string, crawlTimeOut int64) *[]string { func SearchStockInfo(stock, msgType string, crawlTimeOut int64) *[]string {
crawler := CrawlerApi{ crawler := CrawlerApi{
crawlerBaseInfo: CrawlerBaseInfo{ crawlerBaseInfo: CrawlerBaseInfo{

View File

@ -49,7 +49,7 @@ func TestSearchStockPriceInfo(t *testing.T) {
db.Init("../../data/stock.db") db.Init("../../data/stock.db")
//SearchStockPriceInfo("hk06030", 30) //SearchStockPriceInfo("hk06030", 30)
//SearchStockPriceInfo("sh600171", 30) //SearchStockPriceInfo("sh600171", 30)
//SearchStockPriceInfo("gb_aapl", 30) SearchStockPriceInfo("gb_aapl", 30)
SearchStockPriceInfo("bj430198", 30) SearchStockPriceInfo("bj430198", 30)
} }

View File

@ -1,6 +1,8 @@
package data package data
import ( import (
"github.com/PuerkitoBio/goquery"
"go-stock/backend/logger"
"regexp" "regexp"
"strings" "strings"
) )
@ -57,3 +59,29 @@ func ConvertTushareCodeToStockCode(stockCode string) string {
stockCode = strings.ToLower(RemoveAllDigitChar(stockCode)) + RemoveAllNonDigitChar(stockCode) stockCode = strings.ToLower(RemoveAllDigitChar(stockCode)) + RemoveAllNonDigitChar(stockCode)
return strings.ReplaceAll(stockCode, ".", "") return strings.ReplaceAll(stockCode, ".", "")
} }
// GetTableMarkdown renders the first table matching the waitVisible selector
// as a Markdown table, appending the header row, a separator row, and every
// data row to markdown. The accumulated result is logged at the end.
func GetTableMarkdown(document *goquery.Document, waitVisible string, markdown *strings.Builder) {
	document.Find(waitVisible).First().Find("tr").Each(func(rowIdx int, tr *goquery.Selection) {
		var line strings.Builder
		cellCount := 0
		tr.Find("th, td").Each(func(_ int, cell *goquery.Selection) {
			line.WriteString("|")
			line.WriteString(cell.Text())
			cellCount++
		})
		line.WriteString("|")
		markdown.WriteString(line.String() + "\n")
		if rowIdx == 0 {
			// First row is the header: emit the Markdown separator right after.
			markdown.WriteString(strings.Repeat("|---", cellCount) + "|\n")
		}
	})
	logger.SugaredLogger.Infof("\n%s", markdown.String())
}