HTML document encoding cannot be detected?

May 6, 2010 at 1:11 AM
Edited May 6, 2010 at 1:38 AM

// Shared instance of the WebResponse helper from Beattie.Modules.Core.Net
// (NOTE(review): presumably shipped in WebLibrary.dll — confirm).
public static Beattie.Modules.Core.Net.WebResponse webresponse = new Beattie.Modules.Core.Net.WebResponse();

// Request the address typed into txAuctionURL and keep the result in Helper.HTMLCode.
// NOTE(review): presumably the result is the page's HTML already decoded to a string — confirm.
Helper.HTMLCode = Helper.webresponse.GetWebResponse(txAuctionURL.Text);

WebLibrary is not open source, so I cannot rewrite its code to fix the HTML download bug.
I reviewed the code of WebLibrary.dll; the bug is highlighted below.

                // Read the response stream in chunks of up to 0x2000 bytes and append
                // the decoded text of each chunk to sb.
                byte[] buf = new byte[0x2000];
                do
                {
                    count = resStream.Read(buf, 0, buf.Length);
                    if (count != 0)
                    {
                        this._bytesReceived += count;
                        // BUG: every chunk is decoded as UTF-8 no matter what encoding the page
                        // really uses (it can be gb2312, not UTF-8), so such pages come out garbled.
                        // Decoding each chunk on its own can also break a multi-byte character
                        // whose bytes straddle two reads.
                        tempString = Encoding.UTF8.GetString(buf, 0, count);
                        sb.Append(tempString);
                    }
                    // Let pending UI messages be processed between reads.
                    Application.DoEvents();
                }
                while (count > 0);
                req = null;

We can download the HTML either way, but we must detect the document's encoding before decoding it.

/// <summary>
/// Downloads the page at <paramref name="pageUri"/> into memory and parses it into an
/// <see cref="HtmlDocument"/>, using the encoding detected from the downloaded bytes
/// when one is found.
/// </summary>
/// <param name="pageUri">Address of the page to download.</param>
/// <param name="proxy">Proxy to send the request through; when null the request's proxy is left unchanged.</param>
/// <returns>The parsed document, or <c>null</c> when the request fails with a <see cref="WebException"/>.</returns>
protected HtmlDocument ParseUri(Uri pageUri, WebProxy proxy)
{
    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(pageUri);
    req.Method = "GET";
    req.AllowAutoRedirect = true;
    req.UserAgent = "Baiduspider+(+http://www.baidu.com/search/spider.htm)";
    req.Accept = "*/*";
    req.KeepAlive = true;
    if (proxy != null)
    {
        req.Proxy = proxy;
    }
    //req.Timeout = Convert.ToInt32(ConnectionTimeout.Value.TotalMilliseconds);
    //req.ReadWriteTimeout = Convert.ToInt32(ReadTimeout.Value.TotalMilliseconds);
    //req.CookieContainer = CookieContainer;

    try
    {
        // Dispose the response as well as its stream so the connection is released.
        using (WebResponse resp = req.GetResponse())
        using (Stream responseStream = resp.GetResponseStream())
        using (MemoryStream pageBytes = new MemoryStream())
        {
            // Buffer the whole body first: the bytes are needed twice, once to detect
            // the encoding and once to parse, and the network stream cannot be rewound.
            const int bufferSize = 1024;
            byte[] buffer = new byte[bufferSize];
            int bytesRead;
            while ((bytesRead = responseStream.Read(buffer, 0, bufferSize)) > 0)
            {
                pageBytes.Write(buffer, 0, bytesRead);
            }

            HtmlDocument htmlDoc = new HtmlDocument
            {
                OptionAddDebuggingAttributes = false,
                OptionAutoCloseOnEnd = true,
                OptionFixNestedTags = true,
                OptionReadEncoding = true
            };

            // After the copy loop the position is at the end of the buffer, so rewind
            // before detection; otherwise there is nothing left for DetectEncoding to read.
            pageBytes.Seek(0, SeekOrigin.Begin);
            Encoding documentEncoding = htmlDoc.DetectEncoding(pageBytes);

            // Rewind again so the parser starts from the first byte.
            pageBytes.Seek(0, SeekOrigin.Begin);
            if (documentEncoding == null)
            {
                // No encoding found: let Load detect it from the byte order marks.
                htmlDoc.Load(pageBytes, true);
            }
            else
            {
                htmlDoc.Load(pageBytes, documentEncoding, true);
            }

            return htmlDoc;
        }
    }
    catch (WebException e)
    {
        // Keep the "null on failure" contract, but leave a trace instead of
        // swallowing the error silently.
        System.Diagnostics.Trace.TraceWarning("ParseUri failed for {0}: {1}", pageUri, e.Message);
        return null;
    }
}