python爬取接口数据写进数据库

文章类型:python

发布者:hp

发布时间:2024-11-26

一:概述

在项目开发中,有时候我们需要爬取数据并写入数据库。那么面对通过接口请求方式提供数据的网站,我们应该怎么做呢?

二:使用库

1:pymysql,用于 Python 连接 MySQL 数据库服务器的库

pip install pymysql

2:requests,专门用于发送HTTP请求

pip install requests

3:BeautifulSoup

pip install beautifulsoup4

三:流程

1:获取网址,编写获取列表方法

 newUrl=f"https://xxxxx&pageNum={i}&pageSize=100"

2:写入本地文件进行临时存储

  with open('pqms.txt', 'w',encoding='utf-8') as f:
                f.write(str(menuList))
                # 列表地址抓取完毕,开始抓详情
            print("写入本地文件成功")

3:读取本地文件地址,爬取网页

   with open("pqms.txt", "r", encoding="utf-8") as f:
        data = ast.literal_eval(f.read())
    current_info = data[currentNum]
    newUrl = f"https://xxxxxKey={current_info['exerciseKey']}"

4:批量写入数据库

connection=pymysql.connect(
    host='localhost',
    user='root',
    password='root',
    db='cs',
       port = 3306,
       charset="utf8"
    )
     try:
            with connection.cursor() as cursor:
                sql = "INSERT INTO anwer (uuid,title, questionAnswerRemark, createtime,gradeName,typeName,type,isshow,categoryName,updatetime) VALUES (%s,%s, %s, %s,%s, %s, %s,%s, %s,%s)"
                values = uplist
                # 执行批量插入
                cursor.executemany(sql, values)
            # 提交事务
            connection.commit()

四:完整代码

import requests
import pymysql
import re
import time
from bs4 import BeautifulSoup
import ast
class SimpleIdWorker:
    """Generate unique, roughly time-ordered integer IDs (snowflake-style).

    Each ID packs the current millisecond timestamp into the high bits and a
    per-millisecond sequence counter into the low ``_SEQUENCE_BITS`` bits.
    """

    # Low bits reserved for the per-millisecond sequence counter.
    _SEQUENCE_BITS = 17
    _SEQUENCE_MASK = (1 << _SEQUENCE_BITS) - 1

    def __init__(self):
        # Millisecond timestamp of the most recently issued ID.
        self.last_timestamp = -1
        # Counter within the current millisecond.
        self.sequence = 0

    def next_id(self):
        """Return a new unique, monotonically increasing integer ID.

        Bug fix vs. the original: the sequence previously wrapped with
        ``% 10000`` inside the same millisecond, so the 10001st call in one
        millisecond produced a duplicate ID. Now, when the sequence space is
        exhausted, we busy-wait for the clock to reach the next millisecond.
        """
        timestamp = int(time.time() * 1000)
        if timestamp == self.last_timestamp:
            self.sequence = (self.sequence + 1) & self._SEQUENCE_MASK
            if self.sequence == 0:
                # Sequence exhausted for this millisecond: spin until the
                # clock advances so IDs stay unique.
                while timestamp <= self.last_timestamp:
                    timestamp = int(time.time() * 1000)
        else:
            self.sequence = 0
        self.last_timestamp = timestamp
        return (timestamp << self._SEQUENCE_BITS) | self.sequence
# MySQL connection used by the batch inserts below.
# NOTE(review): credentials are hard-coded — acceptable for a throwaway
# crawler, but move them to config/env for anything shared.
connection = pymysql.connect(
    host='localhost',
    user='root',
    password='root',
    db='cs',
    port=3306,
    charset="utf8",
)
print(connection)

# Sample detail page used while developing the crawler.
url = "https://www.kczg.org.cn/article/detail?id=6292914"

# One shared HTTP session, with a browser-like User-Agent so requests
# look like ordinary browser traffic.
session = requests.Session()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.5735.289 Safari/537.36'
}

# Rows buffered for the next batch INSERT.
uplist = []
# Index of the list entry currently being fetched.
currentNum = 0
# Accumulated entries scraped from the list pages.
menuList = []

#函数一获取url列表
def getUrlList():
    """Crawl the paginated list endpoint and persist all entries to pqms.txt.

    Walks pages 1..152 (100 items per page), appending every item of the
    JSON payload's ``data.list`` to the module-level ``menuList``. Once at
    least 1530 entries are collected, the whole list is written to
    ``pqms.txt`` as a Python literal (read back later with
    ``ast.literal_eval`` in ``getDetail``).
    """
    global menuList
    for i in range(1, 153):
        print(f"第{i}页")
        # NOTE(review): placeholder URL — fill in the real list endpoint.
        newUrl = f"https://xxxxx&pageNum={i}&pageSize=100"
        print(newUrl)

        response = session.get(newUrl, headers=headers)
        response.encoding = "utf-8"
        # The endpoint returns JSON, so parse it directly; the original's
        # BeautifulSoup pass over the same body was unused and is removed.
        html = response.json()

        page_items = html['data']['list']
        print(len(page_items))
        menuList.extend(page_items)

        if len(menuList) >= 1530:
            print(f"列表地址抓取完毕,共{len(menuList)}条")
            # Persist as a Python literal for ast.literal_eval() later.
            with open('pqms.txt', 'w', encoding='utf-8') as f:
                f.write(str(menuList))
            print("写入本地文件成功")
            # Bug fix: stop once everything is collected — the original
            # kept requesting pages and rewriting the file on every
            # remaining iteration.
            break

def getDetail():
    """Fetch the detail page for every entry saved in pqms.txt and batch-insert
    the results into the ``anwer`` table, 10 rows per INSERT.

    Bug fixes vs. the original:
    - iterates with a ``while`` loop instead of self-recursion (the original
      recursed once per entry, ~1530 deep, exceeding Python's default
      recursion limit of 1000);
    - flushes the trailing partial batch (< 10 rows) that the original
      silently dropped;
    - terminates cleanly at the end of the data instead of via IndexError;
    - the local variable that shadowed the imported ``time`` module is
      renamed, and the unused BeautifulSoup parse is removed.
    """
    global uplist
    global currentNum
    with open("pqms.txt", "r", encoding="utf-8") as f:
        data = ast.literal_eval(f.read())

    while currentNum < len(data):
        current_info = data[currentNum]
        # NOTE(review): placeholder URL — fill in the real detail endpoint.
        newUrl = f"https://xxxxxKey={current_info['exerciseKey']}"
        response = session.get(newUrl, headers=headers)
        response.encoding = "utf-8"
        newInfo = response.json()['data']

        # Normalize the creation date to "YYYY-MM-DD 00:00:00".
        created = re.match(r"(\d{4}-\d{2}-\d{2})", current_info['createAt']).group(1) + " 00:00:00"
        uplist.append((current_info['exerciseKey'], newInfo['title'],
                       newInfo['explanation'], created, current_info['level'],
                       newInfo['category'], 100, 1, '暂无',
                       '2024-11-26 00:00:00'))
        currentNum = currentNum + 1

        if len(uplist) == 10:
            _flush_batch()

    # Flush whatever is left over (< 10 rows) — the original lost these.
    if uplist:
        _flush_batch()


def _flush_batch():
    """Insert the rows buffered in ``uplist`` into MySQL and clear the buffer."""
    global uplist
    try:
        with connection.cursor() as cursor:
            sql = "INSERT INTO anwer (uuid,title, questionAnswerRemark, createtime,gradeName,typeName,type,isshow,categoryName,updatetime) VALUES (%s,%s, %s, %s,%s, %s, %s,%s, %s,%s)"
            # Batch insert.
            cursor.executemany(sql, uplist)
        connection.commit()
        print('传递完毕,进行下次操作')
    except Exception as e:
        # Roll back the failed transaction.
        connection.rollback()
        print(f"An error occurred: {e}")
    finally:
        # Clear the buffer either way (matches the original, which also
        # discarded a failed batch) so one bad batch doesn't stall the crawl.
        uplist = []
# 入口:先执行 getUrlList() 抓取列表,写入 pqms.txt 后再执行 getDetail() 抓取详情
# 列表
# getUrlList()
# 详情
# getDetail()