下面是一个链图/beautifulsoup 刮板,它从这个网站上刮取医生的个人资料信息。
from bs4 import BeautifulSoup
import requests
import csv
from collections import ChainMap
def get_data(soup):
default_data = {'name': 'n/a', 'clinic': 'n/a', 'profession': 'n/a', 'region': 'n/a', 'city': 'n/a'}
for doctor in soup.select('.view-practitioners .practitioner'):
doctor_data = {}
if doctor.select_one('.practitioner__name').text.strip():
doctor_data['name'] = doctor.select_one('.practitioner__name').text
if doctor.select_one('.practitioner__clinic').text.strip():
doctor_data['clinic'] = doctor.select_one('.practitioner__clinic').text
if doctor.select_one('.practitioner__profession').text.strip():
doctor_data['profession'] = doctor.select_one('.practitioner__profession').text
if doctor.select_one('.practitioner__region').text.strip():
doctor_data['region'] = doctor.select_one('.practitioner__region').text
if doctor.select_one('.practitioner__city').text.strip():
doctor_data['city'] = doctor.select_one('.practitioner__city').text
yield ChainMap(doctor_data, default_data)
url = 'https://sportmedbc.com/practitioners?field_profile_first_name_value=&field_profile_last_name_value=&field_pract_profession_tid=All&city=&taxonomy_vocabulary_5_tid=All&page=%s'
for i in range(5):
page=requests.get(url % i)
soup = BeautifulSoup(page.text, 'lxml')
def print_data(header_text, data, key):
print(header_text)
for d in data:
print(d[key])
print()
data = list(get_data(soup))
print_data('[Names]', data, 'name')
print_data('[Clinic]', data, 'clinic')
print_data('[Profession]', data, 'profession')
print_data('[Taxonomy]', data, 'region')
print_data('[City]', data, 'city')
f=csv.writer('Sports_Medicine_List','w')
f.writerow(['Names','Clinic', 'Profession','Taxonomy','City'])
for i in range(len('Names')):
f.writerow(['Names'[i],'Clinic'[i], 'Profession'[i],'Taxonomy'[i],'City'[i]])
代码运行没有错误,但是,我的 IDE 中没有显示 csv 输出。我认为这是因为我没有正确考虑链图变量,但我不完全确定。有人知道为什么吗?提前致谢!