Scraping API data into a database with Python
Category: python
Posted by: hp
Published: 2024-11-26
When developing a project, we sometimes need to scrape data into a database. For a site that serves its content through API requests rather than rendered HTML, how should we go about it?
1: pymysql, a library for connecting to a MySQL database server from Python
pip install pymysql
2: requests, a library dedicated to sending HTTP requests
pip install requests
3: beautifulsoup4, a library for parsing HTML
pip install beautifulsoup4
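With all three installed, a quick smoke test confirms the environment (a minimal sketch; the versions printed will vary):

import pymysql
import requests
from bs4 import BeautifulSoup

# parse a trivial fragment to confirm beautifulsoup4 works
print(BeautifulSoup("<p>ok</p>", "html.parser").p.text)
print(requests.__version__, pymysql.__version__)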
1: Get the list URL and write a method to fetch it page by page
newUrl=f"https://xxxxx&pageNum={i}&pageSize=100"
2: Write the results to a local file for temporary storage
with open('pqms.txt', 'w', encoding='utf-8') as f:
    f.write(str(menuList))
# list URLs captured; next, scrape the details
print("Saved list to local file")
3: Read the saved list back and fetch each detail page
with open("pqms.txt", "r", encoding="utf-8") as f:
    data = ast.literal_eval(f.read())
current_info = data[currentNum]
newUrl = f"https://xxxxxKey={current_info['exerciseKey']}"
4: Batch-insert into the database
connection = pymysql.connect(
    host='localhost',
    user='root',
    password='root',
    db='cs',
    port=3306,
    charset="utf8"
)
try:
    with connection.cursor() as cursor:
        sql = "INSERT INTO anwer (uuid, title, questionAnswerRemark, createtime, gradeName, typeName, type, isshow, categoryName, updatetime) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
        values = uplist
        # run the batch insert
        cursor.executemany(sql, values)
        # commit the transaction
        connection.commit()
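cursor.executemany sends the whole batch as one parameterized statement, and commit/rollback keeps each batch atomic. For larger imports, flushing in fixed-size chunks limits how much work is lost after a failure; a sketch (insert_batch and the chunk size are our choices):

def insert_batch(connection, rows, chunk_size=100):
    sql = ("INSERT INTO anwer (uuid, title, questionAnswerRemark, createtime, "
           "gradeName, typeName, type, isshow, categoryName, updatetime) "
           "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
    # flush in fixed-size chunks so a failure only rolls back one chunk
    for start in range(0, len(rows), chunk_size):
        try:
            with connection.cursor() as cursor:
                cursor.executemany(sql, rows[start:start + chunk_size])
            connection.commit()
        except Exception:
            connection.rollback()
            raise

Putting all of the pieces together, the complete script: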
import requests
import pymysql
import re
import time
from bs4 import BeautifulSoup
import ast
# snowflake-style id generator: high bits are the millisecond timestamp,
# low bits are a per-millisecond sequence number
class SimpleIdWorker:
    def __init__(self):
        self.last_timestamp = -1
        self.sequence = 0

    def next_id(self):
        timestamp = int(time.time() * 1000)
        if self.last_timestamp == timestamp:
            # same millisecond: bump the sequence to keep ids unique
            self.sequence = (self.sequence + 1) % 10000
        else:
            self.sequence = 0
        self.last_timestamp = timestamp
        return (timestamp << 17) | self.sequence
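# Illustrative usage of the id generator (the script below never calls it;
# the uuid column is filled with exerciseKey instead):
#   worker = SimpleIdWorker()
#   print(worker.next_id())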
# connect to the database
connection = pymysql.connect(
    host='localhost',
    user='root',
    password='root',
    db='cs',
    port=3306,
    charset="utf8"
)
print(connection)
# sample detail page, kept for reference
url = "https://www.kczg.org.cn/article/detail?id=6292914"
# create a Session object so connections are reused across requests
session = requests.Session()
# request headers that mimic a real browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.5735.289 Safari/537.36'
}
# rows waiting to be batch-inserted
uplist = []
# index of the record currently being scraped
currentNum = 0
# records collected from the paginated list API
menuList = []
# function 1: fetch every page of the list API
def getUrlList():
    global menuList
    for i in range(1, 153):
        print(f"Page {i}")
        newUrl = f"https://xxxxx&pageNum={i}&pageSize=100"
        print(newUrl)
        response = session.get(newUrl, headers=headers)
        response.encoding = "utf-8"
        html = response.json()
        print(len(html['data']['list']))
        for m in html['data']['list']:
            menuList.append(m)
        if len(menuList) >= 1530:
            print(f"All list entries captured, {len(menuList)} in total")
            break
    # save the collected list locally once the loop finishes
    with open('pqms.txt', 'w', encoding='utf-8') as f:
        f.write(str(menuList))
    print("Saved list to local file")
# function 2: read the saved list back and scrape each detail record
def getDetail():
    global uplist
    global currentNum
    with open("pqms.txt", "r", encoding="utf-8") as f:
        data = ast.literal_eval(f.read())
    # a plain loop over the records (recursing once per record would hit
    # Python's default recursion limit long before 1,530 calls)
    while currentNum < len(data):
        current_info = data[currentNum]
        newUrl = f"https://xxxxxKey={current_info['exerciseKey']}"
        response = session.get(newUrl, headers=headers)
        response.encoding = "utf-8"
        html = response.json()
        newInfo = html['data']
        # keep only the date part of createAt, normalized to midnight
        createTime = re.match(r"(\d{4}-\d{2}-\d{2})", current_info['createAt']).group(1) + " 00:00:00"
        uplist.append((current_info['exerciseKey'], newInfo['title'], newInfo['explanation'],
                       createTime, current_info['level'], newInfo['category'],
                       100, 1, '暂无', '2024-11-26 00:00:00'))
        # flush to the database every 10 rows
        if len(uplist) == 10:
            try:
                with connection.cursor() as cursor:
                    sql = "INSERT INTO anwer (uuid, title, questionAnswerRemark, createtime, gradeName, typeName, type, isshow, categoryName, updatetime) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
                    # run the batch insert
                    cursor.executemany(sql, uplist)
                    # commit the transaction
                    connection.commit()
            except Exception as e:
                # roll back the transaction on error
                connection.rollback()
                print(f"An error occurred: {e}")
            # clear the buffer either way and move on to the next batch;
            # the connection stays open for the rest of the run
            # connection.close()
            uplist = []
            print('Batch done, continuing')
        currentNum = currentNum + 1
# run the list scrape first:
# getUrlList()
# then the detail scrape:
# getDetail()
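For a first run, the two calls go in order: collect the list, then walk the details. A minimal entry point along those lines (the __main__ guard and the final close() are our additions):

if __name__ == "__main__":
    getUrlList()        # fetch all list pages and save them to pqms.txt
    getDetail()         # read pqms.txt and insert details in batches of 10
    connection.close()  # release the database connection when done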