0

我正在尝试抓取此页面的内容:https://www.google.com/search?hl=en&biw=1920&bih=956&tbm=shop&q=Xenon+12640&oq=Xenon+12640&aq=f&gs_l=serp.3...3743.3743.0.3905.1.1.0.0.0.0.0.0..0.0.ekh..0.0.Hq3XS7AxFDU&sei=Dr_MT_WOM6nO2AWE25mTCA&gbv=2

我遇到的问题是:在浏览器中打开该网址时,我能得到需要抓取的全部内容;但用代码抓取同一个链接时,却缺少两个(重要的)部分,即显示在价格和卖家信息下方的评论数量和评分。这是 C# 内置 Web 客户端的屏幕截图:http://gyazo.com/908a37c7f70712fba1f82ec90a604d4d.png?1338822369

这是我试图获取内容的代码:

    /// <summary>
    /// Issues an HTTP GET for <paramref name="inURL"/> and returns the response body as text.
    /// On success it also updates <c>currentUrl</c> (final response URI) and
    /// <c>currentAddress</c> (request address), and passes the body to <c>setViewState</c>.
    /// </summary>
    /// <param name="inURL">Absolute URL to request.</param>
    /// <param name="inCookieContainer">
    /// Cookie container attached to the request and updated with cookies returned by the server.
    /// May be null, in which case no cookie handling is performed by this method.
    /// </param>
    /// <param name="GZip">When true, advertises gzip/deflate support via Accept-Encoding.</param>
    /// <param name="proxyAddress">Proxy host, or the sentinel "0" for no proxy.</param>
    /// <param name="proxyPort">Proxy port, or 0 for no proxy.</param>
    /// <param name="proxyUserName">User name used to authenticate against the proxy.</param>
    /// <param name="proxyPassword">Password used to authenticate against the proxy.</param>
    /// <returns>
    /// The response body on success. On any failure the exception message is returned instead
    /// of throwing, so callers cannot distinguish an error message from page content by type alone.
    /// </returns>
    public string navGet(string inURL, CookieContainer inCookieContainer, bool GZip, string proxyAddress, int proxyPort,string proxyUserName, string proxyPassword)
    {
        try
        {
            this.currentUrl = inURL;

            HttpWebRequest webRequest = (HttpWebRequest)WebRequest.Create(inURL);
            webRequest.Timeout = this.TimeOutSetting;
            webRequest.CookieContainer = inCookieContainer;

            // A proxy is configured only when neither "no proxy" sentinel is present.
            if (proxyAddress != "0" && proxyPort != 0)
            {
                WebProxy proxy = new WebProxy(proxyAddress, proxyPort);
                // Use login credentials to access proxy
                proxy.Credentials = new NetworkCredential(proxyUserName, proxyPassword);
                webRequest.Proxy = proxy;
            }

            Uri destination = webRequest.Address;
            webRequest.KeepAlive = true;
            webRequest.Method = "GET";
            webRequest.Accept = "*/*";
            webRequest.Headers.Add("Accept-Language", "en-us");
            if (GZip)
            {
                webRequest.Headers.Add("Accept-Encoding", "gzip, deflate");
            }
            webRequest.AllowAutoRedirect = true;
            webRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; FunWebProducts; .NET CLR 1.1.4322; .NET CLR 2.0.50727)";

            // NOTE(review): Content-Type describes a request body, which a GET does not carry.
            // Kept so the request sent on the wire is unchanged.
            webRequest.ContentType = "text/xml";

            // Forward an existing PHP session explicitly. This is best-effort: a missing
            // container or a missing PHPSESSID cookie simply means no extra header is sent.
            if (inCookieContainer != null)
            {
                Cookie sessionCookie = inCookieContainer.GetCookies(destination)["PHPSESSID"];
                if (sessionCookie != null)
                {
                    try
                    {
                        webRequest.Headers.Add("Cookie", "USER_OK=1;PHPSESSID=" + sessionCookie.Value);
                    }
                    catch (ArgumentException)
                    {
                        // The cookie value is not a legal header value; skip the optional header.
                    }
                }
            }

            // Dispose the response (and its connection) on every path, including failures.
            using (HttpWebResponse webResponse = (HttpWebResponse)webRequest.GetResponse())
            {
                if (!webRequest.HaveResponse)
                {
                    return "No response received from host.";
                }

                // First handle cookies: refresh values of cookies we already hold,
                // and add the ones we have not seen before.
                if (inCookieContainer != null)
                {
                    foreach (Cookie retCookie in webResponse.Cookies)
                    {
                        bool cookieFound = false;
                        foreach (Cookie oldCookie in inCookieContainer.GetCookies(destination))
                        {
                            if (retCookie.Name.Equals(oldCookie.Name))
                            {
                                oldCookie.Value = retCookie.Value;
                                cookieFound = true;
                            }
                        }
                        if (!cookieFound)
                        {
                            inCookieContainer.Add(retCookie);
                        }
                    }
                }

                // Read response, undoing any transfer compression the server applied.
                Stream responseStream = webResponse.GetResponseStream();
                string contentEncoding = webResponse.ContentEncoding ?? string.Empty;
                if (contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
                {
                    responseStream = new GZipStream(responseStream, CompressionMode.Decompress);
                }
                else if (contentEncoding.IndexOf("deflate", StringComparison.OrdinalIgnoreCase) >= 0)
                {
                    responseStream = new DeflateStream(responseStream, CompressionMode.Decompress);
                }

                // Decode with the charset the server explicitly declared; otherwise keep the
                // previous behavior of using the system default encoding.
                System.Text.Encoding bodyEncoding = System.Text.Encoding.Default;
                string contentType = webResponse.ContentType;
                string declaredCharset = webResponse.CharacterSet;
                if (contentType != null
                    && contentType.IndexOf("charset", StringComparison.OrdinalIgnoreCase) >= 0
                    && !string.IsNullOrEmpty(declaredCharset))
                {
                    try
                    {
                        bodyEncoding = System.Text.Encoding.GetEncoding(declaredCharset.Trim('"', '\'', ' '));
                    }
                    catch (ArgumentException)
                    {
                        // Unknown charset label; fall back to the default encoding.
                    }
                }

                string responseString;
                // Disposing the reader also disposes the (possibly wrapped) response stream.
                using (StreamReader reader = new StreamReader(responseStream, bodyEncoding))
                {
                    responseString = reader.ReadToEnd();
                }

                this.currentUrl = webResponse.ResponseUri.ToString();
                this.currentAddress = webRequest.Address.ToString();
                setViewState(responseString);
                return responseString;
            }
        }
        catch (Exception ex)
        {
            // Existing contract: failures are reported to the caller as the exception message.
            return ex.Message;
        }
    }
4

1 回答 1

0

看起来这是因为评论数量和评分是通过 JavaScript(可能是 AJAX 之类的技术)动态生成的。在这种情况下,您需要分析页面在浏览器中加载时产生的额外网络请求,找出这些数据是从哪里传输过来的;或者分析 JavaScript 代码,弄清楚这些数据是如何生成的。

于 2012-06-04T16:19:54.760 回答