我有一个包含大约 50 000 个 HTML 文件的文件夹。我正在尝试编写一个脚本:逐个打开这些文件,如果文件的标题(<title>)包含某个字符串,就删除该文件。
这是我迄今为止的尝试:
import aiofiles
import glob
from natsort import natsorted
import asyncio
from bs4 import BeautifulSoup
import os
# First attempt: scan every *.html file matched under "CarsPages" and delete
# the ones whose <title> text contains "Best portal"; `i` counts the matches.
# NOTE(review): indentation was lost in this paste, so the nesting of the
# statements below (what is inside the loop / `async with` / `if`) cannot be
# read off the text -- confirm against the original script.
async def main():
i=0
htmls = glob.glob("CarsPages" + "//*.html")
# Files are visited in the order returned by natsorted().
for html in natsorted(htmls):
async with aiofiles.open(html, mode='r', encoding='UTF-8', errors='strict', buffering=1) as f:
# The whole file is read into memory before parsing.
contents = await f.read()
soup = BeautifulSoup(contents, features="lxml")
if "Best portal" in soup.title.get_text():
i+=1
# BUG (this is the TypeError quoted below): os.close() expects an integer
# file descriptor, but `html` is the path string returned by glob.
os.close(html)
os.remove(html)
print("removing: ", html)
print("Removed: ", i, " pages")
# asyncio.run() creates a fresh event loop, runs main() to completion and
# closes the loop afterwards. Calling asyncio.get_event_loop() without a
# running loop is deprecated in recent Python versions and never closed
# the loop it created.
asyncio.run(main())
但我得到:
os.close(html) TypeError: an integer is required (got type str)
使用 aiofiles 打开文件之后,应该用哪些函数来关闭并删除该文件?
编辑:以下是根据 @joao 的回答修改后可以正常运行的代码
import aiofiles
import glob
from natsort import natsorted
import asyncio
from bs4 import BeautifulSoup
import os
async def main():
    """Delete HTML pages in ``CarsPages`` whose title contains "Best portal".

    Every ``*.html`` file matched under ``CarsPages`` is read in natural
    sort order and parsed with BeautifulSoup. A file is deleted when the
    text of its ``<title>`` element contains the string "Best portal".
    Files without a ``<title>`` element are left untouched. The path of
    each deleted file and the final count are printed.

    Raises:
        UnicodeDecodeError: if a file is not valid UTF-8 (``errors='strict'``).
        OSError: if a file cannot be opened or removed.
    """
    removed = 0
    htmls = glob.glob("CarsPages" + "//*.html")
    for html in natsorted(htmls):
        async with aiofiles.open(html, mode='r', encoding='UTF-8', errors='strict', buffering=1) as f:
            contents = await f.read()
        # Leaving the ``async with`` block closes the file, so no explicit
        # close call is needed. Parsing and deleting happen only after that
        # point because removing a file that is still open fails on Windows.
        soup = BeautifulSoup(contents, features="lxml")
        title = soup.title
        # ``soup.title`` is None when the page has no <title> element;
        # skip such pages instead of failing on ``None.get_text()``.
        if title is None or "Best portal" not in title.get_text():
            continue
        os.remove(html)
        # Count only after the removal succeeded.
        removed += 1
        print("removed: ", html)
    print("Removed: ", removed, " pages")
# asyncio.run() creates a fresh event loop, runs main() to completion and
# closes the loop afterwards. Calling asyncio.get_event_loop() without a
# running loop is deprecated in recent Python versions and never closed
# the loop it created.
asyncio.run(main())