refactor(stock): rework the stock price crawling feature

- Removed the unnecessary chromedp Cancel calls
- Added a crawler test case for Xueqiu (雪球)
- Reworked the stock price crawling logic to use Sina Finance as the data source
- Improved the Markdown formatting of crawl results
- Removed the unused validator package import
ArvinLovegood 2025-03-31 12:33:56 +08:00
parent 34e2de07fb
commit 5f8556cc3d
6 changed files with 94 additions and 88 deletions
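Note: the core of this change is swapping a cls.cn chromedp scrape for parsing the Sina Finance quote page with goquery. A minimal, self-contained sketch of that parsing step, assuming a hypothetical HTML fragment in the shape of the real page (the selectors `div#price` and `div#hqTime` are the ones the diff below actually uses):

```go
package main

import (
	"fmt"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

func main() {
	// Hypothetical fragment mimicking the structure of
	// https://finance.sina.com.cn/realstock/company/<code>/nc.shtml.
	html := `<div id="price">31.50</div><div id="hqTime">2025-03-31 11:30:00</div>`

	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		panic(err)
	}
	// Same selectors the new code queries on the real page.
	price := strings.TrimSpace(doc.Find("div#price").First().Text())
	hqTime := strings.TrimSpace(doc.Find("div#hqTime").First().Text())
	fmt.Printf("### 当前股价:%s 时间:%s\n", price, hqTime)
}
```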

View File

@@ -68,7 +68,7 @@ func (c *CrawlerApi) GetHtml(url, waitVisible string, headless bool) (string, bo
 		defer pcancel()
 		ctx, cancel := chromedp.NewContext(pctx, chromedp.WithLogf(logger.SugaredLogger.Infof))
 		defer cancel()
-		defer chromedp.Cancel(ctx)
+		//defer chromedp.Cancel(ctx)
 		err := chromedp.Run(ctx, chromedp.Navigate(url),
 			chromedp.WaitVisible(waitVisible, chromedp.ByQuery), // make sure the element is visible
 			chromedp.WaitReady(waitVisible, chromedp.ByQuery),   // make sure the element is ready
@@ -81,7 +81,7 @@ func (c *CrawlerApi) GetHtml(url, waitVisible string, headless bool) (string, bo
 	} else {
 		ctx, cancel := chromedp.NewContext(c.crawlerCtx, chromedp.WithLogf(logger.SugaredLogger.Infof))
 		defer cancel()
-		defer chromedp.Cancel(ctx)
+		//defer chromedp.Cancel(ctx)
 		err := chromedp.Run(ctx, chromedp.Navigate(url), chromedp.WaitVisible("body"), chromedp.InnerHTML("body", &htmlContent))
 		if err != nil {
 			logger.SugaredLogger.Error(err.Error())
@@ -199,7 +199,7 @@ func (c *CrawlerApi) GetHtmlWithActions(actions *[]chromedp.Action, headless boo
 		defer pcancel()
 		ctx, cancel := chromedp.NewContext(pctx, chromedp.WithLogf(logger.SugaredLogger.Infof))
 		defer cancel()
-		defer chromedp.Cancel(ctx)
+		//defer chromedp.Cancel(ctx)
 		err := chromedp.Run(ctx, *actions...)
 		if err != nil {
@@ -209,7 +209,7 @@ func (c *CrawlerApi) GetHtmlWithActions(actions *[]chromedp.Action, headless boo
 	} else {
 		ctx, cancel := chromedp.NewContext(c.crawlerCtx, chromedp.WithLogf(logger.SugaredLogger.Infof))
 		defer cancel()
-		defer chromedp.Cancel(ctx)
+		//defer chromedp.Cancel(ctx)
 		err := chromedp.Run(ctx, *actions...)
 		if err != nil {

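Note: each commented-out `chromedp.Cancel(ctx)` sat next to a deferred `cancel()` already returned by `chromedp.NewContext`, so the same context was being torn down twice; `chromedp.Cancel` additionally blocks until the browser exits gracefully. A minimal sketch of the setup this file keeps, assuming a hypothetical `fetchTitle` helper that is not part of the repo:

```go
package main

import (
	"context"

	"github.com/chromedp/chromedp"
)

// fetchTitle is illustrative only: one deferred cancel from NewContext
// is enough to release the tab and browser resources.
func fetchTitle(url string) (string, error) {
	ctx, cancel := chromedp.NewContext(context.Background())
	defer cancel() // no extra chromedp.Cancel needed on the same ctx

	var title string
	err := chromedp.Run(ctx,
		chromedp.Navigate(url),
		chromedp.Title(&title),
	)
	return title, err
}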
View File

@@ -298,6 +298,39 @@ func TestUSSINA(t *testing.T) {
 	})
 }
+func TestXueqiu(t *testing.T) {
+	db.Init("../../data/stock.db")
+	url := "https://finance.sina.com.cn/realstock/company/sz002906/nc.shtml"
+	crawlerAPI := CrawlerApi{}
+	crawlerBaseInfo := CrawlerBaseInfo{
+		Name:        "TestCrawler",
+		Description: "Test Crawler Description",
+		BaseUrl:     "https://finance.sina.com.cn",
+		Headers:     map[string]string{"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0"},
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Minute)
+	defer cancel()
+	crawlerAPI = crawlerAPI.NewCrawler(ctx, crawlerBaseInfo)
+	html, ok := crawlerAPI.GetHtml(url, "div#hqDetails table", true)
+	if !ok {
+		return
+	}
+	document, err := goquery.NewDocumentFromReader(strings.NewReader(html))
+	if err != nil {
+		logger.SugaredLogger.Error(err.Error())
+	}
+	//price
+	price := strutil.RemoveWhiteSpace(document.Find("div#price").First().Text(), false)
+	hqTime := strutil.RemoveWhiteSpace(document.Find("div#hqTime").First().Text(), false)
+	var markdown strings.Builder
+	markdown.WriteString("\n ## 当前股票数据:\n")
+	markdown.WriteString(fmt.Sprintf("### 当前股价:%s 时间:%s\n", price, hqTime))
+	GetTableMarkdown(document, "div#hqDetails table", &markdown)
+}
 type Tick struct {
 	Code   int    `json:"code"`
 	Status string `json:"status"`

View File

@@ -196,7 +196,7 @@ func (o OpenAi) NewChatStream(stock, stockCode, userQuestion string, sysPromptId
 		}
 		msg = append(msg, map[string]interface{}{
 			"role":    "user",
-			"content": stock + time.Now().Format(time.DateOnly) + "价格:" + price,
+			"content": "\n## " + stock + "股价数据:\n" + price,
 		})
 	}()

View File

@@ -15,7 +15,6 @@ import (
 	"github.com/duke-git/lancet/v2/convertor"
 	"github.com/duke-git/lancet/v2/slice"
 	"github.com/duke-git/lancet/v2/strutil"
-	"github.com/duke-git/lancet/v2/validator"
 	"github.com/go-resty/resty/v2"
 	"go-stock/backend/db"
 	"go-stock/backend/logger"
@@ -770,12 +769,12 @@ func GetRealTimeStockPriceInfo(ctx context.Context, stockCode string) (price, pr
 func SearchStockPriceInfo(stockCode string, crawlTimeOut int64) *[]string {
 	if strutil.HasPrefixAny(stockCode, []string{"SZ", "SH", "sh", "sz", "bj"}) {
-		if strutil.HasPrefixAny(stockCode, []string{"bj", "BJ"}) {
-			stockCode = strutil.ReplaceWithMap(stockCode, map[string]string{
-				"bj": "",
-				"BJ": "",
-			}) + ".BJ"
-		}
+		//if strutil.HasPrefixAny(stockCode, []string{"bj", "BJ"}) {
+		//	stockCode = strutil.ReplaceWithMap(stockCode, map[string]string{
+		//		"bj": "",
+		//		"BJ": "",
+		//	}) + ".BJ"
+		//}
 		return getSHSZStockPriceInfo(stockCode, crawlTimeOut)
 	}
@@ -893,90 +892,36 @@ func getHKStockPriceInfo(stockCode string, crawlTimeOut int64) *[]string {
 }
 func getSHSZStockPriceInfo(stockCode string, crawlTimeOut int64) *[]string {
-	var messages []string
-	url := "https://www.cls.cn/stock?code=" + stockCode
-	// create a chromedp context
-	timeoutCtx, timeoutCtxCancel := context.WithTimeout(context.Background(), time.Duration(crawlTimeOut)*time.Second)
-	defer timeoutCtxCancel()
-	var ctx context.Context
-	var cancel context.CancelFunc
-	path := getConfig().BrowserPath
-	logger.SugaredLogger.Infof("SearchStockPriceInfo BrowserPath:%s", path)
-	if path != "" {
-		pctx, pcancel := chromedp.NewExecAllocator(
-			timeoutCtx,
-			chromedp.ExecPath(path),
-			chromedp.Flag("headless", true),
-		)
-		defer pcancel()
-		ctx, cancel = chromedp.NewContext(
-			pctx,
-			chromedp.WithLogf(logger.SugaredLogger.Infof),
-			chromedp.WithErrorf(logger.SugaredLogger.Errorf),
-		)
-	} else {
-		ctx, cancel = chromedp.NewContext(
-			timeoutCtx,
-			chromedp.WithLogf(logger.SugaredLogger.Infof),
-			chromedp.WithErrorf(logger.SugaredLogger.Errorf),
-		)
+	url := "https://finance.sina.com.cn/realstock/company/" + stockCode + "/nc.shtml"
+	crawlerAPI := CrawlerApi{}
+	crawlerBaseInfo := CrawlerBaseInfo{
+		Name:        "TestCrawler",
+		Description: "Test Crawler Description",
+		BaseUrl:     "https://finance.sina.com.cn",
+		Headers:     map[string]string{"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0"},
 	}
+	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(crawlTimeOut)*time.Second)
 	defer cancel()
-	var htmlContent string
-	var tasks chromedp.Tasks
-	tasks = append(tasks, chromedp.Navigate(url))
-	tasks = append(tasks, chromedp.WaitVisible("div.quote-change-box", chromedp.ByQuery))
-	tasks = append(tasks, chromedp.ActionFunc(func(ctx context.Context) error {
-		price, _ := FetchPrice(ctx)
-		logger.SugaredLogger.Infof("price:%s", price)
-		return nil
-	}))
-	tasks = append(tasks, chromedp.OuterHTML("html", &htmlContent, chromedp.ByQuery))
-	err := chromedp.Run(ctx, tasks)
+	crawlerAPI = crawlerAPI.NewCrawler(ctx, crawlerBaseInfo)
+	html, ok := crawlerAPI.GetHtml(url, "div#hqDetails table", true)
+	if !ok {
+		return &[]string{""}
+	}
+	document, err := goquery.NewDocumentFromReader(strings.NewReader(html))
 	if err != nil {
 		logger.SugaredLogger.Error(err.Error())
 		return &[]string{}
 	}
-	document, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
-	if err != nil {
-		logger.SugaredLogger.Error(err.Error())
-		return &[]string{}
-	}
-	document.Find("div.quote-text-border,span.quote-price").Each(func(i int, selection *goquery.Selection) {
-		text := strutil.RemoveNonPrintable(selection.Text())
-		logger.SugaredLogger.Info(text)
-		messages = append(messages, text)
+	//price
+	price := strutil.RemoveWhiteSpace(document.Find("div#price").First().Text(), false)
+	hqTime := strutil.RemoveWhiteSpace(document.Find("div#hqTime").First().Text(), false)
-	})
-	return &messages
+	var markdown strings.Builder
+	markdown.WriteString(fmt.Sprintf("### 当前股价:%s 时间:%s\n", price, hqTime))
+	GetTableMarkdown(document, "div#hqDetails table", &markdown)
+	return &[]string{markdown.String()}
 }
-func FetchPrice(ctx context.Context) (string, error) {
-	var price string
-	timeout := time.After(10 * time.Second)   // 10-second timeout
-	ticker := time.NewTicker(1 * time.Second) // retry once per second
-	defer ticker.Stop()
-	for {
-		select {
-		case <-timeout:
-			return "", fmt.Errorf("timeout reached while fetching price")
-		case <-ticker.C:
-			err := chromedp.Run(ctx, chromedp.Text("span.quote-price", &price, chromedp.BySearch))
-			if err != nil {
-				logger.SugaredLogger.Errorf("failed to fetch price: %v", err)
-				continue
-			}
-			logger.SugaredLogger.Infof("price:%s", price)
-			if price != "" && validator.IsNumberStr(price) {
-				return price, nil
-			}
-		}
-	}
-}
 func SearchStockInfo(stock, msgType string, crawlTimeOut int64) *[]string {
 	crawler := CrawlerApi{
 		crawlerBaseInfo: CrawlerBaseInfo{

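Note: deleting `FetchPrice` removed the only caller of `validator.IsNumberStr`, which is why the lancet validator import is dropped above. If a numeric sanity check on the scraped price is ever needed again, the standard library covers it; a hypothetical equivalent, not part of this commit:

```go
package main

import (
	"fmt"
	"strconv"
)

// isNumberStr reports whether s parses as a number, roughly what
// validator.IsNumberStr checked for the scraped price text.
func isNumberStr(s string) bool {
	_, err := strconv.ParseFloat(s, 64)
	return err == nil
}

func main() {
	fmt.Println(isNumberStr("31.50")) // true
	fmt.Println(isNumberStr("--"))    // false
}
```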
View File

@@ -49,7 +49,7 @@ func TestSearchStockPriceInfo(t *testing.T) {
 	db.Init("../../data/stock.db")
 	//SearchStockPriceInfo("hk06030", 30)
 	//SearchStockPriceInfo("sh600171", 30)
-	//SearchStockPriceInfo("gb_aapl", 30)
+	SearchStockPriceInfo("gb_aapl", 30)
 	SearchStockPriceInfo("bj430198", 30)
 }

View File

@@ -1,6 +1,8 @@
 package data

 import (
+	"github.com/PuerkitoBio/goquery"
+	"go-stock/backend/logger"
 	"regexp"
 	"strings"
 )
@@ -57,3 +59,29 @@ func ConvertTushareCodeToStockCode(stockCode string) string {
 	stockCode = strings.ToLower(RemoveAllDigitChar(stockCode)) + RemoveAllNonDigitChar(stockCode)
 	return strings.ReplaceAll(stockCode, ".", "")
 }
+func GetTableMarkdown(document *goquery.Document, waitVisible string, markdown *strings.Builder) {
+	document.Find(waitVisible).First().Find("tr").Each(func(index int, item *goquery.Selection) {
+		row := ""
+		item.Find("th, td").Each(func(i int, cell *goquery.Selection) {
+			text := cell.Text()
+			row += "|" + text
+		})
+		row += "|"
+		if index == 0 {
+			// Header row
+			markdown.WriteString(row + "\n")
+			// Separator row
+			separator := ""
+			item.Find("th, td").Each(func(i int, cell *goquery.Selection) {
+				separator += "|---"
+			})
+			markdown.WriteString(separator + "|\n")
+		} else {
+			// Data row
+			markdown.WriteString(row + "\n")
+		}
+	})
+	logger.SugaredLogger.Infof("\n%s", markdown.String())
+}
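Note: to make the output shape concrete, here is a trimmed-down, runnable sketch of the same table-to-Markdown logic fed with a tiny hypothetical quote table (the real `GetTableMarkdown` also logs the result via SugaredLogger):

```go
package main

import (
	"fmt"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

// tableToMarkdown mirrors GetTableMarkdown without the logger dependency.
func tableToMarkdown(doc *goquery.Document, selector string, md *strings.Builder) {
	doc.Find(selector).First().Find("tr").Each(func(index int, item *goquery.Selection) {
		row := ""
		item.Find("th, td").Each(func(i int, cell *goquery.Selection) {
			row += "|" + cell.Text()
		})
		md.WriteString(row + "|\n")
		if index == 0 {
			// Emit the Markdown separator row right after the header.
			sep := ""
			item.Find("th, td").Each(func(i int, cell *goquery.Selection) {
				sep += "|---"
			})
			md.WriteString(sep + "|\n")
		}
	})
}

func main() {
	// Hypothetical two-row quote table in the shape of div#hqDetails.
	html := `<table><tr><th>今开</th><th>最高</th></tr><tr><td>31.20</td><td>31.80</td></tr></table>`
	doc, _ := goquery.NewDocumentFromReader(strings.NewReader(html))
	var md strings.Builder
	tableToMarkdown(doc, "table", &md)
	fmt.Print(md.String())
	// Output:
	// |今开|最高|
	// |---|---|
	// |31.20|31.80|
}
```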