mirror of
https://github.com/ArvinLovegood/go-stock.git
synced 2025-07-19 00:00:09 +08:00
refactor(data): 重构财务数据爬取功能
- 移除雪球爬虫测试,改为 sina 和 eastmoney 测试
- 新增 eastmoney 财务数据爬取支持
- 优化 openai_api.go 中的财务报告获取逻辑
- 使用通用爬虫 API 替代 chromedp 实现
This commit is contained in:
parent
5f8556cc3d
commit
f1e40e7d3b
@@ -298,7 +298,7 @@ func TestUSSINA(t *testing.T) {
|
||||
})
|
||||
}
|
||||
|
||||
func TestXueqiu(t *testing.T) {
|
||||
func TestSina(t *testing.T) {
|
||||
db.Init("../../data/stock.db")
|
||||
url := "https://finance.sina.com.cn/realstock/company/sz002906/nc.shtml"
|
||||
crawlerAPI := CrawlerApi{}
|
||||
@@ -331,6 +331,34 @@ func TestXueqiu(t *testing.T) {
|
||||
|
||||
}
|
||||
|
||||
func TestDC(t *testing.T) {
|
||||
url := "https://emweb.securities.eastmoney.com/pc_hsf10/pages/index.html?type=web&code=sh600745#/cwfx"
|
||||
db.Init("../../data/stock.db")
|
||||
crawlerAPI := CrawlerApi{}
|
||||
crawlerBaseInfo := CrawlerBaseInfo{
|
||||
Name: "TestCrawler",
|
||||
Description: "Test Crawler Description",
|
||||
BaseUrl: "https://emweb.securities.eastmoney.com",
|
||||
Headers: map[string]string{"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0"},
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Minute)
|
||||
defer cancel()
|
||||
crawlerAPI = crawlerAPI.NewCrawler(ctx, crawlerBaseInfo)
|
||||
|
||||
var markdown strings.Builder
|
||||
markdown.WriteString("\n ## 财务数据:\n")
|
||||
html, ok := crawlerAPI.GetHtml(url, "div.report_table table", false)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
document, err := goquery.NewDocumentFromReader(strings.NewReader(html))
|
||||
if err != nil {
|
||||
logger.SugaredLogger.Error(err.Error())
|
||||
}
|
||||
GetTableMarkdown(document, "div.report_table table", &markdown)
|
||||
|
||||
}
|
||||
|
||||
type Tick struct {
|
||||
Code int `json:"code"`
|
||||
Status string `json:"status"`
|
||||
|
@@ -499,99 +499,48 @@ func SearchGuShiTongStockInfo(stock string, crawlTimeOut int64) *[]string {
|
||||
}
|
||||
|
||||
func GetFinancialReports(stockCode string, crawlTimeOut int64) *[]string {
|
||||
url := "https://emweb.securities.eastmoney.com/pc_hsf10/pages/index.html?type=web&code=" + stockCode + "#/cwfx"
|
||||
waitVisible := "div.report_table table"
|
||||
if strutil.HasPrefixAny(stockCode, []string{"HK", "hk"}) {
|
||||
stockCode = strings.ReplaceAll(stockCode, "hk", "")
|
||||
stockCode = strings.ReplaceAll(stockCode, "HK", "")
|
||||
url = "https://emweb.securities.eastmoney.com/PC_HKF10/pages/home/index.html?code=" + stockCode + "&type=web&color=w#/NewFinancialAnalysis"
|
||||
waitVisible = "div table.commonTable"
|
||||
}
|
||||
if strutil.HasPrefixAny(stockCode, []string{"us", "gb_"}) {
|
||||
stockCode = strings.ReplaceAll(stockCode, "us", "")
|
||||
stockCode = strings.ReplaceAll(stockCode, "gb_", "")
|
||||
url = "https://emweb.securities.eastmoney.com/pc_usf10/pages/index.html?type=web&code=" + stockCode + "#/cwfx"
|
||||
waitVisible = "div.zyzb_table_detail table"
|
||||
|
||||
}
|
||||
|
||||
// 创建一个 chromedp 上下文
|
||||
timeoutCtx, timeoutCtxCancel := context.WithTimeout(context.Background(), time.Duration(crawlTimeOut)*time.Second)
|
||||
defer timeoutCtxCancel()
|
||||
var ctx context.Context
|
||||
var cancel context.CancelFunc
|
||||
path := getConfig().BrowserPath
|
||||
logger.SugaredLogger.Infof("GetFinancialReports path:%s", path)
|
||||
logger.SugaredLogger.Infof("GetFinancialReports搜索股票-%s: %s", stockCode, url)
|
||||
|
||||
if path != "" {
|
||||
pctx, pcancel := chromedp.NewExecAllocator(
|
||||
timeoutCtx,
|
||||
chromedp.ExecPath(path),
|
||||
chromedp.Flag("headless", true),
|
||||
chromedp.Flag("disable-javascript", false),
|
||||
chromedp.Flag("disable-gpu", true),
|
||||
chromedp.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0"),
|
||||
chromedp.Flag("disable-background-networking", true),
|
||||
chromedp.Flag("enable-features", "NetworkService,NetworkServiceInProcess"),
|
||||
chromedp.Flag("disable-background-timer-throttling", true),
|
||||
chromedp.Flag("disable-backgrounding-occluded-windows", true),
|
||||
chromedp.Flag("disable-breakpad", true),
|
||||
chromedp.Flag("disable-client-side-phishing-detection", true),
|
||||
chromedp.Flag("disable-default-apps", true),
|
||||
chromedp.Flag("disable-dev-shm-usage", true),
|
||||
chromedp.Flag("disable-extensions", true),
|
||||
chromedp.Flag("disable-features", "site-per-process,Translate,BlinkGenPropertyTrees"),
|
||||
chromedp.Flag("disable-hang-monitor", true),
|
||||
chromedp.Flag("disable-ipc-flooding-protection", true),
|
||||
chromedp.Flag("disable-popup-blocking", true),
|
||||
chromedp.Flag("disable-prompt-on-repost", true),
|
||||
chromedp.Flag("disable-renderer-backgrounding", true),
|
||||
chromedp.Flag("disable-sync", true),
|
||||
chromedp.Flag("force-color-profile", "srgb"),
|
||||
chromedp.Flag("metrics-recording-only", true),
|
||||
chromedp.Flag("safebrowsing-disable-auto-update", true),
|
||||
chromedp.Flag("enable-automation", true),
|
||||
chromedp.Flag("password-store", "basic"),
|
||||
chromedp.Flag("use-mock-keychain", true),
|
||||
)
|
||||
defer pcancel()
|
||||
ctx, cancel = chromedp.NewContext(
|
||||
pctx,
|
||||
chromedp.WithLogf(logger.SugaredLogger.Infof),
|
||||
chromedp.WithErrorf(logger.SugaredLogger.Errorf),
|
||||
)
|
||||
} else {
|
||||
ctx, cancel = chromedp.NewContext(
|
||||
timeoutCtx,
|
||||
chromedp.WithLogf(logger.SugaredLogger.Infof),
|
||||
chromedp.WithErrorf(logger.SugaredLogger.Errorf),
|
||||
)
|
||||
db.Init("../../data/stock.db")
|
||||
crawlerAPI := CrawlerApi{}
|
||||
crawlerBaseInfo := CrawlerBaseInfo{
|
||||
Name: "TestCrawler",
|
||||
Description: "Test Crawler Description",
|
||||
BaseUrl: "https://emweb.securities.eastmoney.com",
|
||||
Headers: map[string]string{"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0"},
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(crawlTimeOut)*time.Second)
|
||||
defer cancel()
|
||||
var htmlContent string
|
||||
url := fmt.Sprintf("https://xueqiu.com/snowman/S/%s/detail#/ZYCWZB", stockCode)
|
||||
err := chromedp.Run(ctx,
|
||||
chromedp.Navigate(url),
|
||||
// 等待页面加载完成,可以根据需要调整等待时间
|
||||
chromedp.WaitVisible("table.table", chromedp.ByQuery),
|
||||
chromedp.OuterHTML("html", &htmlContent, chromedp.ByQuery),
|
||||
)
|
||||
crawlerAPI = crawlerAPI.NewCrawler(ctx, crawlerBaseInfo)
|
||||
|
||||
var markdown strings.Builder
|
||||
markdown.WriteString("\n## 财务数据:\n")
|
||||
html, ok := crawlerAPI.GetHtml(url, waitVisible, true)
|
||||
if !ok {
|
||||
return &[]string{""}
|
||||
}
|
||||
document, err := goquery.NewDocumentFromReader(strings.NewReader(html))
|
||||
if err != nil {
|
||||
logger.SugaredLogger.Error(err.Error())
|
||||
}
|
||||
document, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
|
||||
if err != nil {
|
||||
logger.SugaredLogger.Error(err.Error())
|
||||
return &[]string{}
|
||||
}
|
||||
var messages []string
|
||||
document.Find("table tr").Each(func(i int, selection *goquery.Selection) {
|
||||
tr := ""
|
||||
selection.Find("th,td").Each(func(i int, selection *goquery.Selection) {
|
||||
ret := selection.Find("p").First().Text()
|
||||
if ret == "" {
|
||||
ret = selection.Text()
|
||||
}
|
||||
text := strutil.RemoveNonPrintable(ret)
|
||||
tr += text + " "
|
||||
})
|
||||
logger.SugaredLogger.Infof("%s", tr+" \n")
|
||||
messages = append(messages, tr+" \n")
|
||||
})
|
||||
return &messages
|
||||
GetTableMarkdown(document, waitVisible, &markdown)
|
||||
return &[]string{markdown.String()}
|
||||
}
|
||||
|
||||
func GetTelegraphList(crawlTimeOut int64) *[]string {
|
||||
|
Loading…
x
Reference in New Issue
Block a user