
Below is a ChainMap/BeautifulSoup scraper that pulls practitioner profile information from this website.

from bs4 import BeautifulSoup
import requests
import csv
from collections import ChainMap


def get_data(soup):
    default_data = {'name': 'n/a', 'clinic': 'n/a', 'profession': 'n/a', 'region': 'n/a', 'city': 'n/a'}

    for doctor in soup.select('.view-practitioners .practitioner'):
        doctor_data = {}

        if doctor.select_one('.practitioner__name').text.strip():
            doctor_data['name'] = doctor.select_one('.practitioner__name').text

        if doctor.select_one('.practitioner__clinic').text.strip():
            doctor_data['clinic'] = doctor.select_one('.practitioner__clinic').text

        if doctor.select_one('.practitioner__profession').text.strip():
            doctor_data['profession'] = doctor.select_one('.practitioner__profession').text

        if doctor.select_one('.practitioner__region').text.strip():
            doctor_data['region'] = doctor.select_one('.practitioner__region').text

        if doctor.select_one('.practitioner__city').text.strip():
            doctor_data['city'] = doctor.select_one('.practitioner__city').text

        yield ChainMap(doctor_data, default_data)


url = 'https://sportmedbc.com/practitioners?field_profile_first_name_value=&field_profile_last_name_value=&field_pract_profession_tid=All&city=&taxonomy_vocabulary_5_tid=All&page=%s'

for i in range(5):
    page=requests.get(url % i)
    soup = BeautifulSoup(page.text, 'lxml')

def print_data(header_text, data, key):
    print(header_text)
    for d in data:
        print(d[key])
    print()

data = list(get_data(soup))
print_data('[Names]', data, 'name')
print_data('[Clinic]', data, 'clinic')
print_data('[Profession]', data, 'profession')
print_data('[Taxonomy]', data, 'region')
print_data('[City]', data, 'city')


f=csv.writer('Sports_Medicine_List','w')
f.writerow(['Names','Clinic', 'Profession','Taxonomy','City'])
for i in range(len('Names')):
    f.writerow(['Names'[i],'Clinic'[i], 'Profession'[i],'Taxonomy'[i],'City'[i]])

The code runs without errors, but no CSV output shows up in my IDE. I think it is because I am not handling the ChainMap variable correctly, but I am not entirely sure. Does anyone know why? Thanks in advance!
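For reference, csv.writer expects an open file object rather than a filename string, and the final loop iterates over the characters of the literal string 'Names'. A minimal sketch of the intended usage, assuming the rows should come from the data list built by get_data above:

import csv

with open('Sports_Medicine_List.csv', 'w', newline='') as f:
    writer = csv.writer(f)  # csv.writer takes a file object, not a filename string
    writer.writerow(['Names', 'Clinic', 'Profession', 'Taxonomy', 'City'])
    for d in data:  # `data` is the list of ChainMaps returned by get_data(soup)
        writer.writerow([d['name'], d['clinic'], d['profession'], d['region'], d['city']])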


2 Answers


Here is another approach you could consider trying:

import requests
from bs4 import BeautifulSoup
import csv

def get_data(link):
    for pagelink in [link.format(page) for page in range(5)]:
        res = requests.get(pagelink)
        soup = BeautifulSoup(res.text,"lxml")

        data = []
        for doctor in soup.select('.view-practitioners .practitioner'):
            doctor_data = {}

            doctor_data['name'] = doctor.select_one('.practitioner__name').text
            doctor_data['clinic'] = doctor.select_one('.practitioner__clinic').text
            doctor_data['profession'] = doctor.select_one('.practitioner__profession').text
            doctor_data['region'] = doctor.select_one('.practitioner__region').text
            doctor_data['city'] = doctor.select_one('.practitioner__city').text
            data.append(doctor_data)

        for item in data:
            writer.writerow(item)

if __name__ == '__main__':
    url = 'https://sportmedbc.com/practitioners?field_profile_first_name_value=&field_profile_last_name_value=&field_pract_profession_tid=All&city=&taxonomy_vocabulary_5_tid=All&page={}'
    with open("doctorsinfo.csv","w",newline="") as infile:
        fieldnames = ['name', 'clinic', 'profession', 'region', 'city']
        writer = csv.DictWriter(infile, fieldnames=fieldnames)
        writer.writeheader()
        get_data(url)
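Note that get_data writes through the module-level writer created in the __main__ block, so it has to be called inside the with open(...) context. A variant (just a sketch) that passes the writer in explicitly makes that dependency easier to see:

def get_data(link, writer):
    # Scrape each results page and stream the rows straight to the given DictWriter.
    for pagelink in [link.format(page) for page in range(5)]:
        res = requests.get(pagelink)
        soup = BeautifulSoup(res.text, "lxml")
        for doctor in soup.select('.view-practitioners .practitioner'):
            writer.writerow({
                'name': doctor.select_one('.practitioner__name').text,
                'clinic': doctor.select_one('.practitioner__clinic').text,
                'profession': doctor.select_one('.practitioner__profession').text,
                'region': doctor.select_one('.practitioner__region').text,
                'city': doctor.select_one('.practitioner__city').text,
            })

# called as get_data(url, writer) inside the `with open(...)` block above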
Answered 2018-08-12

For writing dicts to a CSV file you can use csv.DictWriter (documentation here); a ChainMap is essentially just a variant of a dictionary:
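As a quick illustration of why this works (the values below are made up for the example), a ChainMap looks keys up in its underlying mappings in order, so missing fields fall back to the defaults, and csv.DictWriter accepts it like any other mapping:

from collections import ChainMap
import csv, io

defaults = {'name': 'n/a', 'clinic': 'n/a', 'profession': 'n/a', 'region': 'n/a', 'city': 'n/a'}
row = ChainMap({'name': 'Dr. Example', 'city': 'Vancouver'}, defaults)  # hypothetical scraped values

print(row['name'])    # 'Dr. Example' -- found in the first mapping
print(row['clinic'])  # 'n/a'         -- falls back to the defaults mapping

buf = io.StringIO()
writer = csv.DictWriter(buf, fieldnames=['name', 'clinic', 'profession', 'region', 'city'])
writer.writeheader()
writer.writerow(row)  # DictWriter treats the ChainMap like a plain dict
print(buf.getvalue())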

from bs4 import BeautifulSoup
import requests
import csv
from collections import ChainMap

def get_data(soup):
    default_data = {'name': 'n/a', 'clinic': 'n/a', 'profession': 'n/a', 'region': 'n/a', 'city': 'n/a'}

    for doctor in soup.select('.view-practitioners .practitioner'):
        doctor_data = {}

        if doctor.select_one('.practitioner__name').text.strip():
            doctor_data['name'] = doctor.select_one('.practitioner__name').text

        if doctor.select_one('.practitioner__clinic').text.strip():
            doctor_data['clinic'] = doctor.select_one('.practitioner__clinic').text

        if doctor.select_one('.practitioner__profession').text.strip():
            doctor_data['profession'] = doctor.select_one('.practitioner__profession').text

        if doctor.select_one('.practitioner__region').text.strip():
            doctor_data['region'] = doctor.select_one('.practitioner__region').text

        if doctor.select_one('.practitioner__city').text.strip():
            doctor_data['city'] = doctor.select_one('.practitioner__city').text

        yield ChainMap(doctor_data, default_data)


url = 'https://sportmedbc.com/practitioners?field_profile_first_name_value=&field_profile_last_name_value=&field_pract_profession_tid=All&city=&taxonomy_vocabulary_5_tid=All&page=%s'

with open('data.csv', 'w', newline='') as csvfile:

    fieldnames = ['name', 'clinic', 'profession', 'region', 'city']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for i in range(5):
        page=requests.get(url % i)
        soup = BeautifulSoup(page.text, 'lxml')
        writer.writerows(get_data(soup))

This outputs all the data to the data.csv file. Here is the result opened in LibreOffice:

(screenshot of data.csv opened in LibreOffice)

Answered 2018-08-12