爬虫接口

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import requests
import pandas as pd
from bs4 import BeautifulSoup


# 1. Crawler interface
def api(urls, timeout=10):
    """Fetch *urls* and return the raw Response on HTTP 200, else None.

    Callers use ``.text`` / ``.json()`` on the result, so the Response
    object itself — not its body — is what must be returned here (the
    original aliased it to ``html_content``, which was misleading).

    Args:
        urls: URL string to request.
        timeout: seconds before the request aborts (new, defaulted, so
            existing callers are unaffected; prevents indefinite hangs).

    Returns:
        requests.Response on success, None on any non-200 status.
    """
    # Send the request for the page.
    response = requests.get(urls, timeout=timeout)

    # Only HTTP 200 counts as success; anything else yields an explicit
    # None so callers can detect the failure (the original fell off the
    # end of the function and returned None implicitly).
    if response.status_code == 200:
        return response
    return None


# 2. Static scraping with BeautifulSoup
def bs(url='http://caf-qibei.com/index?type=html'):
    """Download *url* and return it parsed as a BeautifulSoup tree.

    The target address is now a parameter (defaulting to the original
    hard-coded URL) so the same helper can scrape other pages; existing
    zero-argument callers behave exactly as before.

    Returns:
        bs4.BeautifulSoup parsed with the stdlib 'html.parser'.
    """
    # NOTE(review): if api() fails it returns None and .text raises
    # AttributeError — same behavior as the original code.
    html = api(url).text
    # Parse the HTML content.
    soup = BeautifulSoup(html, 'html.parser')
    return soup


# 3. Fund temperature
def temps():
    """Scrape the fund-temperature table and return it as a DataFrame.

    A qualifying <tr> has at least two <a href> links and a
    <td colspan="3"> cell whose stripped text parses as a number; each
    contributes (index name, temperature), where the name is the first
    link's text.

    Returns:
        pd.DataFrame with two default-numbered columns:
        0 = index name (str), 1 = temperature (float).
    """
    # Find every table row on the page.
    tr_tags = bs().find_all('tr')

    rows = []
    for tr in tr_tags:
        # Hoisted: the original re-ran these finds up to three times
        # per row (filter + extraction); do each lookup once.
        links = tr.find_all('a', href=True)
        cell = tr.find('td', {'colspan': '3'})
        if len(links) < 2 or cell is None:
            continue
        text = cell.text.strip()
        # Accept only plain decimal numbers (one '.' removed at most).
        if not text.replace('.', '', 1).isdigit():
            continue
        rows.append((links[0].text.strip(), float(text)))

    # 'rows' also avoids shadowing this function's own name, which the
    # original local variable 'temps' did.
    return pd.DataFrame(rows)


# 3. Dynamic scraping (JSON)
def js(url):
    """Fetch *url* and return its body decoded as JSON.

    Bug fix: the original called ``api(jsl=url)``, but ``api``'s
    parameter is named ``urls`` — every call raised
    ``TypeError: api() got an unexpected keyword argument 'jsl'``.
    Passing the URL positionally fixes it.
    """
    return api(url).json()


# Jisilu ETF list endpoint (returns JSON).
etf_jsl = 'https://www.jisilu.cn/data/etf/etf_list/?___jsl=LST___t=1693819271762&volume=500&unit_total=2&rp=25'

# # Fund temperature
# temp = temps()
# print(temp)

# Jisilu data: fetch the payload, then keep only its 'rows' list.
payload = js(etf_jsl)
jsl = payload['rows']
print(jsl)

1

1

2

1

3

1

3

1

3

1

四大魔盒

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# 定义爬虫接口函数
import requests
from bs4 import BeautifulSoup
import pandas as pd


def api(url, timeout=10):
    """Fetch *url* and return the parsed BeautifulSoup tree, or None.

    Prints a diagnostic and returns None on any non-200 status so the
    caller can bail out instead of working with a bad page.

    Args:
        url: page to download.
        timeout: seconds before the request aborts (new, defaulted, so
            existing callers are unaffected).
    """
    # Send the request; the timeout keeps the script from hanging forever.
    response = requests.get(url, timeout=timeout)

    # Guard clause: report and bail on failure (original used an
    # if/else with the success path split around the else branch).
    if response.status_code != 200:
        print(f"请求失败,状态码:{response.status_code}")
        return None

    # Parse the HTML body.
    return BeautifulSoup(response.text, 'html.parser')


def extract_bond_yield(soup):
    """
    Pull the ten-year treasury yield out of the page. The value lives
    in the <td> whose id is "idEpvDate"; returns its stripped text, or
    None when that element is absent.
    """
    cell = soup.find('td', {'id': 'idEpvDate'})
    return cell.text.strip() if cell else None


def clean_data(url):
    """Scrape the valuation tables at *url*.

    Returns:
        (cleaned_data, bond_yield): a list of per-row dicts and the
        ten-year treasury yield string. On fetch failure returns
        ``(None, None)`` — the original returned a bare ``None``, which
        made the caller's tuple unpacking
        (``data, bond_yield = clean_data(url)``) raise TypeError.
    """
    soup = api(url)
    if soup is None:
        return None, None

    # Ten-year treasury yield (lives in a separate element on the page).
    bond_yield = extract_bond_yield(soup)

    # All tables with id="stock".
    tables = soup.find_all('table', {'id': 'stock'})
    cleaned_data = []

    for table in tables:
        rows = table.find_all('tr')[2:]  # skip the two header rows

        for row in rows:
            columns = row.find_all('td')
            if not columns:
                continue  # skip rows without data cells

            first_text = columns[0].text
            data = {
                # First cell of every data row; 'columns' is known
                # non-empty here, so no length check is needed.
                'PE_TTM': first_text.strip(),
                # EPV (expected yield) from the third cell, if present.
                'EPV': columns[2].text.strip() if len(columns) > 2 else None,
            }

            # Label/value rows: the label sits in column 0, the value in
            # column 1. The original repeated this extraction verbatim
            # for each of the three labels.
            value = columns[1].text.strip() if len(columns) > 1 else None
            for label in ('股市吸引力', '巴菲特指数', '七日换手率'):
                if label in first_text:
                    data[label] = value

            cleaned_data.append(data)

    return cleaned_data, bond_yield


# Example URL — replace with the actual page to scrape.
url = "http://caf-qibei.com/index?type=html"
data, bond_yield = clean_data(url)


# Pull the individual indicators out of the scraped rows.
# NOTE(review): row positions 2/5/6/7 are assumed fixed by the page
# layout — confirm against the live table.
if data:
    PE_TTM = data[2]['PE_TTM']      # P/E, trailing twelve months
    GZ = bond_yield                 # ten-year treasury yield
    EPV = data[2]['EPV']            # expected yield
    XYL = data[5]['股市吸引力']      # stock-market attractiveness
    BFT = data[6]['巴菲特指数']      # Buffett indicator
    HSL = data[7]['七日换手率']      # seven-day turnover rate
    for value in (PE_TTM, GZ, EPV, XYL, BFT, HSL):
        print(value)

基金温度

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import requests
import pandas as pd
from bs4 import BeautifulSoup


# Crawler interface
def api(urls, timeout=10):
    """Download the fund-temperature page at *urls* and print the result.

    On HTTP 200 the page is parsed and the (name, temperature) table is
    printed as a DataFrame; otherwise a failure message is printed.
    The original mixed fetching, parsing, and printing in one body —
    parsing now lives in the private helper ``_parse_temps``.

    Args:
        urls: URL string to request.
        timeout: seconds before the request aborts (new, defaulted, so
            existing callers are unaffected).
    """
    # Send the request for the page.
    response = requests.get(urls, timeout=timeout)

    if response.status_code == 200:
        # Parse the HTML content and print the extracted table.
        soup = BeautifulSoup(response.text, 'html.parser')
        print(_parse_temps(soup))
    else:
        print(f"请求失败,状态码:{response.status_code}")


def _parse_temps(soup):
    """Extract (index name, fund temperature) pairs into a DataFrame.

    A qualifying <tr> has at least two <a href> links and a numeric
    <td colspan="3"> cell; the name is the first link's text.
    """
    rows = []
    for tr in soup.find_all('tr'):
        # Do each lookup once; the original repeated these finds up to
        # three times per row.
        links = tr.find_all('a', href=True)
        cell = tr.find('td', {'colspan': '3'})
        if len(links) < 2 or cell is None:
            continue
        text = cell.text.strip()
        # Accept only plain decimal numbers (one '.' removed at most).
        if text.replace('.', '', 1).isdigit():
            rows.append((links[0].text.strip(), float(text)))
    return pd.DataFrame(rows)


# Target site URL.
url = 'http://caf-qibei.com/index?type=html'

# Kick off the fetch-parse-print run.
api(url)