Python 爬取新浪财经部分股票的历史交易数据

发布时间:2021-05-13 发表于话题:如何获取股票交易数据 点击:68 当前位置:财神股票资讯网 军事 Python 爬取新浪财经部分股票的历史交易数据 手机阅读

本文仅供学习交流,如有错误纰漏,还请谅解,欢迎大家一起来学习探讨!

目录:参考资料(感谢!) · 爬取准备 · 爬取思路 · 模块1:网页表格数据爬取 · 模块2:添加输出数据 · 源代码(近期可能还要修改...) · 爬取近一个月的历史交易数据 · 爬取近一年的历史交易数据

参考资料(感谢!)

配角七三—如何抓取网页中的表格:
https://zhuanlan.zhihu.com/p/33986020

爬取准备

import os
import random
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

爬取思路

找到数据所在的网页,利用开发者工具,查看网页url,请求状态,源代码等等,然后定位数据元素。随后,进行编程。利用相关函数,模拟访问网页,采集数据,加以处理,并保存至本地。(细节之处不到位,还请见谅,博主还会再找时间另外总结)

模块1:网页表格数据爬取

def get_stock_table(stockcode, i):
    """Fetch one 2019 history page for a stock and save about a month of rows.

    The page for ``stockcode`` with ``jidu=i`` is downloaded, every table
    whose id is ``FundHoldSharesTable`` is parsed, and the first parsed row
    is promoted to column headers.  If the page holds too few rows,
    ``add_stock_table`` is asked to top the frame up; otherwise the frame is
    trimmed.  The result is written to ``<stockcode>.xlsx`` and the function
    then sleeps 1-10 seconds before returning.

    Args:
        stockcode: Stock code used in the URL and in the output file name.
        i: Value of the ``jidu`` query parameter (presumably the quarter
            number, 1-4 -- confirm against the site).

    Returns:
        None.  The data is written to an Excel file as a side effect.

    Raises:
        requests.RequestException: On timeout, connection failure or an HTTP
            error status.
        ValueError: If the page contains no ``FundHoldSharesTable`` table.
    """
    url = ('http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/stockid/'
           + str(stockcode) + '.phtml?year=2019&jidu=' + str(i))
    print(url)

    # A timeout keeps one stalled connection from hanging the whole crawl,
    # and raise_for_status stops an error page from being parsed as data.
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    res.encoding = 'gbk'

    soup = BeautifulSoup(res.text, 'lxml')
    tables = soup.find_all('table', {'id': 'FundHoldSharesTable'})
    if not tables:
        raise ValueError('no FundHoldSharesTable table found at ' + url)

    # read_html is given a file-like object because passing literal HTML
    # strings is deprecated in recent pandas releases.
    df_list = [pd.concat(pd.read_html(StringIO(table.prettify())))
               for table in tables]
    df = pd.concat(df_list)

    # The first parsed row carries the column names.
    headers = df.iloc[0]
    df = pd.DataFrame(df.values[1:], columns=headers)

    if len(df) - 1 < 22:
        # Not enough rows on this page: top up from the previous one.
        c = len(df) - 1
        df = add_stock_table(stockcode, i, c, df)
    else:
        # NOTE(review): this keeps positions 1..21 and therefore skips the
        # row at position 0 -- confirm that dropping it is intended.
        df = pd.DataFrame(df.values[1:22], columns=headers)

    df = df.reset_index(drop=True)
    # '...\\' is a placeholder directory; replace it with a real path.
    df.to_excel('...\\' + str(stockcode) + '.xlsx')

    # Random pause between requests so the site is not hit in a tight loop.
    time.sleep(random.randint(1, 10))

不过,当所请求的季度页面中的数据不足一个月(约22个交易日)时,以上函数无法取满一个月的数据,因此还需要再加一个函数,从上一季度的页面补充数据。如果要获取近一年的数据,则还要再加一个循环,逐季度向前补充。

模块2:添加输出数据

def add_stock_table(stockcode, i, c, df):
    """Return ``df`` with rows from the page preceding ``jidu=i`` appended.

    The preceding page is downloaded, every ``FundHoldSharesTable`` table is
    parsed, the first parsed row is used as column headers, and a randomly
    sized leading slice of the remaining rows is appended to ``df``.

    Args:
        stockcode: Stock code used in the URL.
        i: ``jidu`` value of the page that was already fetched; this function
            requests the one before it.
        c: Row count already collected by the caller; it shrinks the number
            of rows taken from the extra page.
        df: Frame to extend.  It is not modified in place.

    Returns:
        A new DataFrame: ``df`` followed by the extra rows.

    Raises:
        requests.RequestException: On timeout, connection failure or an HTTP
            error status.
        ValueError: If the page contains no ``FundHoldSharesTable`` table.
    """
    quarter = i - 1
    year = 2019
    if quarter < 1:
        # Assumes `jidu` is the quarter number (1-4): there is no quarter 0,
        # so the page before Q1 is Q4 of the previous year.
        quarter, year = 4, year - 1

    url = ('http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/stockid/'
           + str(stockcode) + '.phtml?year=' + str(year) + '&jidu=' + str(quarter))

    # Timeout and status check: fail fast instead of hanging or parsing an
    # error page.
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    res.encoding = 'gbk'

    soup = BeautifulSoup(res.text, 'lxml')
    tables = soup.find_all('table', {'id': 'FundHoldSharesTable'})
    if not tables:
        raise ValueError('no FundHoldSharesTable table found at ' + url)

    # File-like input: literal HTML strings are deprecated in read_html.
    df_addlist = [pd.concat(pd.read_html(StringIO(table.prettify())))
                  for table in tables]
    df_add = pd.concat(df_addlist)
    headers = df_add.iloc[0]

    # Clamp the stop index: a negative stop would count from the END of the
    # array and pull in almost the whole page instead of zero rows.
    stop = max(1, random.randint(20, 22) - c)
    df_add = pd.DataFrame(df_add.values[1:stop], columns=headers)

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    return pd.concat([df, df_add])

谨记!本文仅供学习交流,如有错误纰漏,还请原谅,欢迎指教!博主较佛(懒),随缘修改!

源代码(近期可能还要修改…)

注意:
源代码无法直接套用!
.xlsx文件的路径需要修改!!
其他的根据自己的需要做变更。

爬取近一个月的历史交易数据

"""Save roughly one month of 2019 trading history per stock to Excel files."""
import os
import random
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Common prefix of every history-page URL requested by this script.
HISTORY_URL = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/stockid/'


def _fetch_raw_table(url):
    """Download ``url`` and return its FundHoldSharesTable rows as one frame.

    The returned frame is unprocessed: its first row still holds the column
    names.  Raises ``requests.RequestException`` on network/HTTP failure and
    ``ValueError`` when the page has no such table.
    """
    # Timeout and status check: fail fast instead of hanging or parsing an
    # error page.
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    res.encoding = 'gbk'
    soup = BeautifulSoup(res.text, 'lxml')
    tables = soup.find_all('table', {'id': 'FundHoldSharesTable'})
    if not tables:
        raise ValueError('no FundHoldSharesTable table found at ' + url)
    # File-like input: literal HTML strings are deprecated in read_html.
    return pd.concat(
        [pd.concat(pd.read_html(StringIO(table.prettify()))) for table in tables]
    )


def get_stock_table(stockcode, i):
    """Fetch the ``jidu=i`` page of 2019 for a stock and save about a month.

    If the page holds too few rows, ``add_stock_table`` tops the frame up
    from the preceding page; otherwise the frame is trimmed.  The result is
    written to ``<stockcode>.xlsx``; the function then sleeps 1-10 seconds.

    Args:
        stockcode: Stock code used in the URL and the output file name.
        i: Value of the ``jidu`` query parameter (presumably the quarter
            number, 1-4 -- confirm against the site).
    """
    url = HISTORY_URL + str(stockcode) + '.phtml?year=2019&jidu=' + str(i)
    print(url)
    raw = _fetch_raw_table(url)

    # The first parsed row carries the column names.
    headers = raw.iloc[0]
    df = pd.DataFrame(raw.values[1:], columns=headers)

    if len(df) - 1 < 22:
        c = len(df) - 1
        df = add_stock_table(stockcode, i, c, df)
    else:
        # NOTE(review): this keeps positions 1..21 and therefore skips the
        # row at position 0 -- confirm that dropping it is intended.
        df = pd.DataFrame(df.values[1:22], columns=headers)

    df = df.reset_index(drop=True)
    # '...\\' is a placeholder directory; replace it with a real path.
    df.to_excel('...\\' + str(stockcode) + '.xlsx')

    # Random pause between requests so the site is not hit in a tight loop.
    time.sleep(random.randint(1, 10))


def add_stock_table(stockcode, i, c, df):
    """Return ``df`` with rows from the page preceding ``jidu=i`` appended.

    Args:
        stockcode: Stock code used in the URL.
        i: ``jidu`` value already fetched; the page before it is requested.
        c: Row count already collected; shrinks the number of rows taken.
        df: Frame to extend.  It is not modified in place.

    Returns:
        A new DataFrame: ``df`` followed by the extra rows.
    """
    quarter = i - 1
    year = 2019
    if quarter < 1:
        # Assumes `jidu` is the quarter number (1-4): there is no quarter 0,
        # so the page before Q1 is Q4 of the previous year.
        quarter, year = 4, year - 1

    url = (HISTORY_URL + str(stockcode)
           + '.phtml?year=' + str(year) + '&jidu=' + str(quarter))
    raw = _fetch_raw_table(url)
    headers = raw.iloc[0]

    # Clamp the stop index: a negative stop would count from the END of the
    # array and pull in almost the whole page instead of zero rows.
    stop = max(1, random.randint(20, 22) - c)
    df_add = pd.DataFrame(raw.values[1:stop], columns=headers)

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    return pd.concat([df, df_add])


def main():
    """Crawl every configured stock code, one Excel file per code."""
    stale = "...\\601006.xlsx"
    if os.path.exists(stale):
        os.remove(stale)

    stockcode = ['601006', '000046', '601398', '000069', '601939', '000402',
                 '000001', '000089', '000027', '399001', '000002', '000800',
                 '601111', '600050', '601600', '600028', '601857', '601988',
                 '000951', '601919']
    i = 2

    print("正在爬取month_stock信息...\n")
    print("---------------\n")
    print("请耐心等待...\n")
    for index, x in enumerate(stockcode, start=1):
        print(index)
        get_stock_table(x, i)


if __name__ == "__main__":
    main()

爬取近一年的历史交易数据

"""Save roughly one year (about 250 rows) of trading history to Excel."""
import os
import random
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Common prefix of every history-page URL requested by this script.
HISTORY_URL = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/stockid/'

# Number of rows that get_stock_yeartable tries to collect before saving.
ROWS_PER_YEAR = 250

# The Shanghai and Shenzhen exchanges opened in 1990, so there is nothing to
# fetch before that; this bounds the back-fill loop.
EARLIEST_YEAR = 1990


def _fetch_raw_table(url):
    """Download ``url`` and return its FundHoldSharesTable rows as one frame.

    The returned frame is unprocessed: its first row still holds the column
    names.  Raises ``requests.RequestException`` on network/HTTP failure and
    ``ValueError`` when the page has no such table.
    """
    # Timeout and status check: fail fast instead of hanging or parsing an
    # error page.
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    res.encoding = 'gbk'
    soup = BeautifulSoup(res.text, 'lxml')
    tables = soup.find_all('table', {'id': 'FundHoldSharesTable'})
    if not tables:
        raise ValueError('no FundHoldSharesTable table found at ' + url)
    # File-like input: literal HTML strings are deprecated in read_html.
    return pd.concat(
        [pd.concat(pd.read_html(StringIO(table.prettify()))) for table in tables]
    )


def get_stock_yeartable(stockcode, s, y):
    """Collect about 250 history rows ending at quarter ``s`` of year ``y``.

    The page for (``y``, ``s``) is fetched first; earlier quarters are then
    appended one at a time, stepping back across year boundaries, until at
    least 250 rows are collected.  The result is trimmed, written to
    ``sh<stockcode>.xlsx``, and the function sleeps 1-10 seconds.

    Args:
        stockcode: Code used in the URL and the output file name.
        s: Starting ``jidu`` value (treated as the quarter number, 1-4).
        y: Starting year.
    """
    url = (HISTORY_URL + str(stockcode)
           + '/type/S.phtml?year=' + str(y) + '&jidu=' + str(s))
    print(url)
    raw = _fetch_raw_table(url)

    # The first parsed row carries the column names.
    headers = raw.iloc[0]
    df = pd.DataFrame(raw.values[1:], columns=headers)

    # Walk backwards one quarter at a time.  The published listing's loop
    # header was garbled into invalid syntax; this is the reconstruction.
    while len(df) < ROWS_PER_YEAR:
        s -= 1
        if s < 1:
            s, y = 4, y - 1
        if y < EARLIEST_YEAR:
            # Ran out of history; save whatever was collected.
            break
        df = add_stock_table(stockcode, s, y, df)

    df = df.reset_index(drop=True)
    # NOTE(review): this keeps positions 1..249 and therefore skips the row
    # at position 0 -- confirm that dropping it is intended.
    df = pd.DataFrame(df.values[1:ROWS_PER_YEAR], columns=headers)
    df.to_excel('D:\\Workplace\\PyCharm\\MySpider\\sh' + str(stockcode) + '.xlsx')

    # Random pause between requests so the site is not hit in a tight loop.
    time.sleep(random.randint(1, 10))


def add_stock_table(stockcode, s, y, df):
    """Return ``df`` with all rows of the (``y``, ``s``) page appended.

    Args:
        stockcode: Code used in the URL.
        s: ``jidu`` value of the page to fetch.
        y: Year of the page to fetch.
        df: Frame to extend.  It is not modified in place.

    Returns:
        A new DataFrame: ``df`` followed by the page's rows.
    """
    print(y, "-", s)
    url = (HISTORY_URL + str(stockcode)
           + '/type/S.phtml?year=' + str(y) + '&jidu=' + str(s))
    print(url)
    raw = _fetch_raw_table(url)
    headers = raw.iloc[0]
    df_add = pd.DataFrame(raw.values[1:], columns=headers)

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    return pd.concat([df, df_add])


def main():
    """Crawl every configured code, one Excel file per code."""
    stale = "D:\\Workplace\\PyCharm\\MySpider\\sh000001.xlsx"
    if os.path.exists(stale):
        os.remove(stale)

    stockcode = ['000001']
    s = 2
    y = 2019

    print("正在爬取year_sh_stock信息...\n")
    print("---------------\n")
    print("请耐心等待...\n")
    # enumerate replaces a manual counter that was never incremented.
    for index, x in enumerate(stockcode, start=1):
        print(index)
        get_stock_yeartable(x, s, y)


if __name__ == "__main__":
    main()

本文来源:https://www.thyysj.com/info/467064.html

标签组:[python] [python函数] [table]

相关APP下载

热门话题

军事推荐文章

军事热门文章