1
Vote

Unable to load web pages with other encodings

description

For example, try http://www.sohu.com, which is encoded in GB2312.
When the page is downloaded, a routine needs to be in place that first decodes the raw data as UTF-8, parses the result to find the "<meta>" tag containing "charset", extracts the encoding named there, and then decodes the data again using that encoding.

file attachments

comments

matrixreloaded wrote Apr 6, 2010 at 4:09 PM

/// <summary>
/// Scans HTML text for the first &lt;meta&gt; tag that declares a charset and
/// returns the declared encoding name.
/// </summary>
/// <remarks>
/// Handles both the HTML5 form (<c>&lt;meta charset="utf-8"&gt;</c>) and the
/// http-equiv form (<c>content="text/html; charset=gb2312"</c>). Every
/// &lt;meta&gt; tag is examined, not just the first one, because pages often
/// put keyword/description tags before the charset declaration.
/// </remarks>
/// <param name="data">The page text, decoded with any ASCII-compatible encoding. May be null.</param>
/// <returns>
/// The charset name in lower case with surrounding quotes removed, or
/// <c>null</c> when <paramref name="data"/> is null/empty or no charset is declared.
/// </returns>
public string GetEncodingFromBody3(string data)
{
    if (string.IsNullOrEmpty(data))
    {
        return null;
    }

    // Any <meta ...> tag; \b keeps tags such as <metadata> from matching.
    const string metaTagPattern = "<\\s*meta\\b[^>]*>";

    // "charset = value" where the value may open with a single or double quote.
    // Only the bare name is captured, so quotes never leak into the result.
    const string charsetPattern = "charset\\s*=\\s*[\"']?\\s*([\\w.:+-]+)";

    // CultureInvariant keeps case-insensitive matching independent of the
    // current thread culture.
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.CultureInvariant;

    foreach (Match metaTag in Regex.Matches(data, metaTagPattern, options))
    {
        Match charset = Regex.Match(metaTag.Value, charsetPattern, options);
        if (charset.Success)
        {
            // ToLowerInvariant: ToLower() would turn "ISO-..." into a dotless-i
            // name under a Turkish culture.
            return charset.Groups[1].Value.ToLowerInvariant();
        }
    }

    return null;
}
//========================
//====code snippet===========
// Decode the downloaded bytes in two passes: once with the current encoding to
// sniff the charset declared in the page's <meta> tag, then again with the
// encoding that tag names.
buffer = stream2.ToArray();

// First pass: the text only needs to be readable enough to find the <meta> tag.
// NOTE: response.CharacterSet is not consulted here; the original snippet had
// that branch commented out, so only the page body decides the encoding.
str = encoding.GetString(buffer);
string encodeName = GetEncodingFromBody3(str);

if (!string.IsNullOrEmpty(encodeName))
{
    try
    {
        encoding = Encoding.GetEncoding(encodeName);
    }
    catch (System.ArgumentException)
    {
        // The page declared a charset name this runtime does not recognise
        // (misspelled, or its code page is not available): fall back to UTF-8
        // rather than failing the whole download.
        encoding = Encoding.GetEncoding("utf-8");
    }
}
else
{
    // No charset declared in the page: default to UTF-8.
    encoding = Encoding.GetEncoding("utf-8");
}

// Second pass: decode the same bytes with the resolved encoding.
str = encoding.GetString(buffer);

wrote Feb 13, 2013 at 4:32 AM