0
#!/usr/bin/python
from TwitterSearch import *

import sys
import csv

# Build the search query: English-language tweets containing the keyword "gmo",
# with entity metadata (hashtags, urls, media) excluded from each result.
tso = TwitterSearchOrder() # create a TwitterSearchOrder object
tso.set_keywords(['gmo']) # let's define all words we would like to have a look for
tso.set_language('en') # we want to see English tweets only
tso.set_include_entities(False) # and don't give us all those entity information

max_range = 1           # search range in kilometres
                        # NOTE(review): max_range appears unused in the visible code
num_results = 500       # minimum results to obtain
outfile = "output.csv"  # CSV destination for the matched tweets


# create twitter API object
# SECURITY(review): real OAuth credentials are hard-coded below and are now
# exposed in source control — revoke them and load from the environment or a
# config file instead of committing them.
twitter = TwitterSearch(
                        access_token = "764537836884242432-GzJmUSL4hcC2DOJD71TiQXwCA0aGosz",
                        access_token_secret = "zDGYDeigRqDkmdqTgBOltcfNcNnfLwRZPkPLlnFyY3xqQ",
                        consumer_key = "Kr9ThiJWvPa1uTXZoj4O0YaSG",
                        consumer_secret = "ozGCkXtTCyCdOcL7ZFO4PJs85IaijjEuhl6iIdZU0AdH9CCoxS"
                        )

# Create an array of USA states
# All 50 US state abbreviations plus DC, the territories / freely associated
# states (AS, FM, GU, MH, MP, PR, PW, VI), and a catch-all "USA" entry.
# Original element order is preserved; only the layout is compacted.
ustates = [
    "AL", "AK", "AS", "AZ", "AR", "CA", "CO", "CT", "DE", "DC",
    "FM", "FL", "GA", "GU", "HI", "ID", "IL", "IN", "IA", "KS",
    "KY", "LA", "ME", "MH", "MD", "MA", "MI", "MN", "MS", "MO",
    "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC", "ND", "MP",
    "OH", "OK", "OR", "PW", "PA", "PR", "RI", "SC", "SD", "TN",
    "TX", "UT", "VT", "VI", "VA", "WA", "WV", "WI", "WY", "USA",
]

def linearSearch(item, obj, start=0):
    """Return True when ``item`` occurs in ``obj`` at index ``start`` or later.

    Walks the indexable sequence ``obj`` from ``start`` through its end,
    comparing each element against ``item`` with ``==``.
    """
    return any(item == obj[idx] for idx in range(start, len(obj)))
# open a file to write (mode "w"), and create a CSV writer object
# FIX: use the open() builtin instead of the legacy file() constructor —
# file() was removed in Python 3 and open() is the preferred spelling even
# in Python 2.
csvfile = open(outfile, "w")
csvwriter = csv.writer(csvfile)

# add headings to our CSV file
row = ["user", "text", "place"]
csvwriter.writerow(row)

#-----------------------------------------------------------------------
# the twitter API only allows us to query up to 100 tweets at a time.
# to search for more, we will break our search up into 10 "pages", each
# of which will include 100 matching tweets.
#-----------------------------------------------------------------------
result_count = 0
last_id = None

while result_count <  num_results:
    # perform a search based on latitude and longitude
    # twitter API docs: https://dev.twitter.com/docs/api/1/get/search
    query = twitter.search_tweets_iterable(tso)

    for result in query:
        state = 0
        if result["place"]:
            user = result["user"]["screen_name"]
            text = result["text"]
            text = text.encode('utf-8', 'replace')
            place = result["place"]["full_name"]
            state = place.split(",")[1]
        if linearSearch(state,ustates):
            print state
            # now write this row to our CSV file
            row = [ user, text, place ]
            csvwriter.writerow(row)
            result_count += 1
        last_id = result["id"]

    print "got %d results" % result_count

csvfile.close()

我正在尝试按我的数组 usstates 对推文进行分类,但第二个 if 块似乎不起作用。我对此一无所知。我所做的是进行线性搜索,如果我的项目等于数组中的项目,我会将其写入 csv 文件。

4

1 回答 1

0

因为问题看起来是字符串中残留了一些空格（例如 " TX"），您可以使用 .strip() 来删除它们

>>> x=" WY "
>>> x.strip()
'WY'
>>> 

还有一些其他的提示

  1. 为了加快成员资格测试，ustates 应使用集合（set）而不是列表，因为集合的成员检查是常数时间，而列表是线性搜索

  2. 打开文件的首选方法是使用上下文管理器，它能确保在代码块结束或块中发生错误时关闭文件。另外应使用 open 而不是 file

有了这些提示,代码应该看起来像

#!/usr/bin/python

... # all the previous stuff

# Create a set of USA states
ustates = {  
           "AL", "AK", "AS", "AZ", "AR",
           "CA", "CO", "CT",
           "DE", "DC",
           "FM", "FL",
           "GA", "GU",
           "HI",
           "ID", "IL", "IN", "IA",
           "KS", "KY",
           "LA",
           "ME", "MH", "MD", "MA", "MI", "MN", "MS", "MO", "MT", "MP",
           "NE", "NV", "NH", "NJ", "NM", "NY", "NC", "ND",
           "OH", "OK", "OR",
           "PW", "PA", "PR",
           "RI",
           "SC", "SD",
           "TN", "TX",
           "UT",
           "VT", "VI", "VA",
           "WA", "WV", "WI", "WY",
           "USA"
           } # that arrange is just to take less lines, while grouping them alphabetically 


# open a file to write (mode "w"), and create a CSV writer object
with open(outfile,"w") as csvfile:
    ...    # the rest is the same

    while result_count <  num_results:
        # perform a search based on latitude and longitude
        # twitter API docs: https://dev.twitter.com/docs/api/1/get/search
        query = twitter.search_tweets_iterable(tso)

        for result in query:
            state = 0
            if result["place"]:
                ... # all the other stuff
                state = state.strip()     #<--- the strip part, add the .upper() if needed or just in case
            if state in ustates:
                ... # all the other stuff
            ... # the rest of stuff

        print "got %d results" % result_count
于 2016-09-11T13:19:39.267 回答