29542

Java escape Html special Chars

Question:

How would I make this method nicer?

String l = text; // System.out.println("startet UmlautAnpassen"); l = l.replaceAll("$", "usd"); l = l.replaceAll("�", "xxx"); l = l.replaceAll("", "xxx"); l = l.replaceAll("Ä", "ä"); l = l.replaceAll("Ö", "ü"); l = l.replaceAll("Ãœ", "ö"); l = l.replaceAll("ä", "Ä"); l = l.replaceAll("ö", "Ü"); l = l.replaceAll("ü", "Ö"); l = l.replaceAll("ß", "ß"); if (l.contains("&#") && l.contains(";")) { if ((l.contains("&#x") || l.contains("&#X")) && l.contains(";")) { l = l.replaceAll("&#[xX]20;", " "); l = l.replaceAll("&#[xX]21;", "!"); l = l.replaceAll("&#[xX]22;", "\""); l = l.replaceAll("&#[xX]23;", "#"); l = l.replaceAll("&#[xX]24;", "usd"); l = l.replaceAll("&#[xX]25;", "%"); l = l.replaceAll("&#[xX]26;", "&"); l = l.replaceAll("&#[xX]27;", "'"); l = l.replaceAll("&#[xX]28;", "("); l = l.replaceAll("&#[xX]29;", ")"); l = l.replaceAll("&#[xX]2A;", "*"); l = l.replaceAll("&#[xX]2B;", "+"); l = l.replaceAll("&#[xX]2C;", ","); l = l.replaceAll("&#[xX]2D;", "-"); l = l.replaceAll("&#[xX]2E;", "."); l = l.replaceAll("&#[xX]2F;", "/"); l = l.replaceAll("&#[xX]30;", "0"); l = l.replaceAll("&#[xX]31;", "1"); l = l.replaceAll("&#[xX]32;", "2"); l = l.replaceAll("&#[xX]33;", "3"); l = l.replaceAll("&#[xX]34;", "4"); l = l.replaceAll("&#[xX]35;", "5"); l = l.replaceAll("&#[xX]36;", "6"); l = l.replaceAll("&#[xX]37;", "7"); l = l.replaceAll("&#[xX]38;", "8"); l = l.replaceAll("&#[xX]39;", "9"); l = l.replaceAll("&#[xX]3A;", ":"); l = l.replaceAll("&#[xX]3B;", ";"); l = l.replaceAll("&#[xX]3C;", "<"); l = l.replaceAll("&#[xX]3D;", "="); l = l.replaceAll("&#[xX]3E;", ">"); l = l.replaceAll("&#[xX]3F;", "?"); l = l.replaceAll("&#[xX]40;", "@"); l = l.replaceAll("&#[xX]41;", "A"); l = l.replaceAll("&#[xX]42;", "B"); l = l.replaceAll("&#[xX]43;", "C"); l = l.replaceAll("&#[xX]44;", "D"); l = l.replaceAll("&#[xX]45;", "E"); l = l.replaceAll("&#[xX]46;", "F"); l = l.replaceAll("&#[xX]47;", "G"); l = l.replaceAll("&#[xX]48;", "H"); l = l.replaceAll("&#[xX]49;", "I"); l = 
l.replaceAll("&#[xX]4A;", "J"); l = l.replaceAll("&#[xX]4B;", "K"); l = l.replaceAll("&#[xX]4C;", "L"); l = l.replaceAll("&#[xX]4D;", "M"); l = l.replaceAll("&#[xX]4E;", "N"); l = l.replaceAll("&#[xX]4F;", "O"); l = l.replaceAll("&#[xX]50;", "P"); l = l.replaceAll("&#[xX]51;", "Q"); l = l.replaceAll("&#[xX]52;", "R"); l = l.replaceAll("&#[xX]53;", "S"); l = l.replaceAll("&#[xX]54;", "T"); l = l.replaceAll("&#[xX]55;", "U"); l = l.replaceAll("&#[xX]56;", "V"); l = l.replaceAll("&#[xX]57;", "W"); l = l.replaceAll("&#[xX]58;", "X"); l = l.replaceAll("&#[xX]59;", "Y"); l = l.replaceAll("&#[xX]5A;", "Z"); l = l.replaceAll("&#[xX]5B;", "("); l = l.replaceAll("&#[xX]5C;", "\\/"); l = l.replaceAll("&#[xX]5D;", ")"); l = l.replaceAll("&#[xX]5E;", "^"); l = l.replaceAll("&#[xX]5F;", "_"); l = l.replaceAll("&#[xX]60;", " "); l = l.replaceAll("&#[xX]61;", "a"); l = l.replaceAll("&#[xX]62;", "b"); l = l.replaceAll("&#[xX]63;", "c"); l = l.replaceAll("&#[xX]64;", "d"); l = l.replaceAll("&#[xX]65;", "e"); l = l.replaceAll("&#[xX]66;", "f"); l = l.replaceAll("&#[xX]67;", "g"); l = l.replaceAll("&#[xX]68;", "h"); l = l.replaceAll("&#[xX]69;", "i"); l = l.replaceAll("&#[xX]6A;", "j"); l = l.replaceAll("&#[xX]6B;", "k"); l = l.replaceAll("&#[xX]6C;", "l"); l = l.replaceAll("&#[xX]6D;", "m"); l = l.replaceAll("&#[xX]6E;", "n"); l = l.replaceAll("&#[xX]6F;", "o"); l = l.replaceAll("&#[xX]70;", "p"); l = l.replaceAll("&#[xX]71;", "q"); l = l.replaceAll("&#[xX]72;", "r"); l = l.replaceAll("&#[xX]73;", "s"); l = l.replaceAll("&#[xX]74;", "t"); l = l.replaceAll("&#[xX]75;", "u"); l = l.replaceAll("&#[xX]76;", "v"); l = l.replaceAll("&#[xX]77;", "w"); l = l.replaceAll("&#[xX]78;", "x"); l = l.replaceAll("&#[xX]79;", "y"); l = l.replaceAll("&#[xX]7A;", "z"); l = l.replaceAll("&#[xX]7B;", "("); l = l.replaceAll("&#[xX]7C;", "/"); l = l.replaceAll("&#[xX]7D;", ")"); l = l.replaceAll("&#[xX]7E;", "~"); l = l.replaceAll("&#[xX][aA]0;", " "); l = l.replaceAll("&#[xX][aA]1;", "¡"); l = 
l.replaceAll("&#[xX][aA]2;", "cent"); l = l.replaceAll("&#[xX][aA]3;", "gbf"); l = l.replaceAll("&#[xX][aA]5;", "yen"); l = l.replaceAll("&#[xX][aA]7;", "§"); l = l.replaceAll("&#[xX][aA]9;", "copyright"); l = l.replaceAll("&#[xX][aA][bB]\\;", "<<"); l = l.replaceAll("&#[xX][aA][dD]\\;", ""); l = l.replaceAll("&#[xX][aA][eE]\\;", "reg"); l = l.replaceAll("&#[xX][bB]0;", "°"); l = l.replaceAll("&#[xX][bB]1;", "+/-"); l = l.replaceAll("&#[xX][bB]2;", "^2"); l = l.replaceAll("&#[xX][bB]3;", "^3"); l = l.replaceAll("&#[xX][bB]5;", "micro"); l = l.replaceAll("&#[xX][bB]7;", "-"); l = l.replaceAll("&#[xX][bB]8;", ","); l = l.replaceAll("&#[xX][bB]9;", "^1"); l = l.replaceAll("&#[xX][bB][aA];", "º"); l = l.replaceAll("&#[xX][bB][bB];", ">>"); l = l.replaceAll("&#[xX][bB][cC];", "1/4"); l = l.replaceAll("&#[xX][bB][dD];", "1/2"); l = l.replaceAll("&#[xX][bB][eE];", "3/4"); l = l.replaceAll("&#[xX][bB][fF];", "¿"); l = l.replaceAll("&#[xX][cC]0;", "À"); l = l.replaceAll("&#[xX][cC]1;", "Á"); l = l.replaceAll("&#[xX][cC]2;", "Â"); l = l.replaceAll("&#[xX][cC]3;", "Â"); l = l.replaceAll("&#[xX][cC]4;", "Ä"); l = l.replaceAll("&#[xX][cC]5;", "Å"); l = l.replaceAll("&#[xX][cC]6;", "Æ"); l = l.replaceAll("&#[xX][cC]7;", "Ç"); l = l.replaceAll("&#[xX][cC]8;", "È"); l = l.replaceAll("&#[xX][cC]9;", "É"); l = l.replaceAll("&#[xX][cC][aA];", "Ê"); l = l.replaceAll("&#[xX][cC][bB];", "Ë"); l = l.replaceAll("&#[xX][cC][cC];", "Ì"); l = l.replaceAll("&#[xX][cC][dD];", "Í"); l = l.replaceAll("&#[xX][cC][eE];", "Î"); l = l.replaceAll("&#[xX][cC][fF];", "Ï"); l = l.replaceAll("&#[xX][dD]0;", "Ð"); l = l.replaceAll("&#[xX][dD]1;", "Ñ"); l = l.replaceAll("&#[xX][dD]2;", "Ò"); l = l.replaceAll("&#[xX][dD]3;", "Ó"); l = l.replaceAll("&#[xX][dD]4;", "Ô"); l = l.replaceAll("&#[xX][dD]5;", "Õ"); l = l.replaceAll("&#[xX][dD]6;", "Ö"); l = l.replaceAll("&#[xX][dD]7;", "×"); l = l.replaceAll("&#[xX][dD]8;", "Ø"); l = l.replaceAll("&#[xX][dD]9;", "Ù"); l = l.replaceAll("&#[xX][dD][aA];", "Ú"); l = 
l.replaceAll("&#[xX][dD][bB];", "Û"); l = l.replaceAll("&#[xX][dD][cC];", "Ü"); l = l.replaceAll("&#[xX][dD][dD];", "Ý"); l = l.replaceAll("&#[xX][dD][eE];", "Þ"); l = l.replaceAll("&#[xX][dD][fF];", "ß"); l = l.replaceAll("&#[xX][eE]0;", "à"); l = l.replaceAll("&#[xX][eE]1;", "á"); l = l.replaceAll("&#[xX][eE]2;", "â"); l = l.replaceAll("&#[xX][eE]3;", "ã"); l = l.replaceAll("&#[xX][eE]4;", "ä"); l = l.replaceAll("&#[xX][eE]5;", "å"); l = l.replaceAll("&#[xX][eE]6;", "æ"); l = l.replaceAll("&#[xX][eE]7;", "ç"); l = l.replaceAll("&#[xX][eE]8;", "è"); l = l.replaceAll("&#[xX][eE]9;", "é"); l = l.replaceAll("&#[xX][eE][aA];", "ê"); l = l.replaceAll("&#[xX][eE][bB];", "ë"); l = l.replaceAll("&#[xX][eE][cC];", "ì"); l = l.replaceAll("&#[xX][eE][dD];", "í"); l = l.replaceAll("&#[xX][eE][eE];", "î"); l = l.replaceAll("&#[xX][eE][fF];", "ï"); l = l.replaceAll("&#[xX][fF]0;", "ð"); l = l.replaceAll("&#[xX][fF]1;", "ñ"); l = l.replaceAll("&#[xX][fF]2;", "ò"); l = l.replaceAll("&#[xX][fF]3;", "ó"); l = l.replaceAll("&#[xX][fF]4;", "ô"); l = l.replaceAll("&#[xX][fF]5;", "õ"); l = l.replaceAll("&#[xX][fF]6;", "ö"); l = l.replaceAll("&#[xX][fF]7;", "÷"); l = l.replaceAll("&#[xX][fF]8;", "ø"); l = l.replaceAll("&#[xX][fF]9;", "ù"); l = l.replaceAll("&#[xX][fF][aA];", "ú"); l = l.replaceAll("&#[xX][fF][bB];", "û"); l = l.replaceAll("&#[xX][fF][cC];", "ü"); l = l.replaceAll("&#[xX][fF][dD];", "ý"); l = l.replaceAll("&#[xX][fF][eE];", "þ"); l = l.replaceAll("&#[xX][fF][fF];", "ÿ"); l = l.replaceAll("&#[xX]\\w{2};", " "); } // end if ((l.contains("&#[xX]") || l.contains("&#X"))|| // l.contains(";")) if (l.contains("&#") && l.contains(";")) { l = l.replaceAll("&#131;", "ƒ"); l = l.replaceAll("&#133;", "..."); l = l.replaceAll("&#138;", "Š"); l = l.replaceAll("&#137;", "promille"); l = l.replaceAll("&#140;", "Œ"); l = l.replaceAll("&#154;", "š"); l = l.replaceAll("&#156;", "œ"); l = l.replaceAll("&#160;", " "); l = l.replaceAll("&#161;", "¡"); l = l.replaceAll("&#162;", "cent"); l = 
l.replaceAll("&#163;", "gbf"); l = l.replaceAll("&#165;", "yen"); l = l.replaceAll("&#167;", "§"); l = l.replaceAll("&#169;", "copyright"); l = l.replaceAll("&#171;", "<<"); l = l.replaceAll("&#173;", ""); l = l.replaceAll("&#174;", "reg"); l = l.replaceAll("&#176;", "°"); l = l.replaceAll("&#177;", "+/-"); l = l.replaceAll("&#178;", "^2"); l = l.replaceAll("&#179;", "^3"); l = l.replaceAll("&#180;", " "); l = l.replaceAll("&#181;", "micro"); l = l.replaceAll("&#184;", ","); l = l.replaceAll("&#185;", "^1"); l = l.replaceAll("&#186;", "º"); l = l.replaceAll("&#187;", ">>"); l = l.replaceAll("&#188;", "1/4"); l = l.replaceAll("&#189;", "1/2"); l = l.replaceAll("&#190;", "3/4"); l = l.replaceAll("&#191;", "¿"); l = l.replaceAll("&#192;", "À"); l = l.replaceAll("&#193;", "Á"); l = l.replaceAll("&#194;", "Â"); l = l.replaceAll("&#195;", "Ã"); l = l.replaceAll("&#196;", "Ä"); l = l.replaceAll("&#197;", "Å"); l = l.replaceAll("&#198;", "Æ"); l = l.replaceAll("&#199;", "Ç"); l = l.replaceAll("&#200;", "È"); l = l.replaceAll("&#201;", "É"); l = l.replaceAll("&#202;", "Ê"); l = l.replaceAll("&#203;", "Ë"); l = l.replaceAll("&#204;", "Ì"); l = l.replaceAll("&#205;", "Í"); l = l.replaceAll("&#206;", "Î"); l = l.replaceAll("&#207;", "Ï"); l = l.replaceAll("&#208;", "Ð"); l = l.replaceAll("&#209;", "Ñ"); l = l.replaceAll("&#210;", "Ò"); l = l.replaceAll("&#211;", "Ó"); l = l.replaceAll("&#212;", "Ô"); l = l.replaceAll("&#213;", "Õ"); l = l.replaceAll("&#214;", "Ö"); l = l.replaceAll("&#215;", "x"); l = l.replaceAll("&#216;", "Ø"); l = l.replaceAll("&#217;", "Ù"); l = l.replaceAll("&#218;", "Ú"); l = l.replaceAll("&#219;", "Û"); l = l.replaceAll("&#220;", "Ü"); l = l.replaceAll("&#221;", "Ý"); l = l.replaceAll("&#222;", "Þ"); l = l.replaceAll("&#223;", "ß"); l = l.replaceAll("&#224;", "à"); l = l.replaceAll("&#225;", "á"); l = l.replaceAll("&#226;", "â"); l = l.replaceAll("&#227;", "ã"); l = l.replaceAll("&#228;", "ä"); l = l.replaceAll("&#229;", "å"); l = l.replaceAll("&#230;", 
"æ"); l = l.replaceAll("&#231;", "ç"); l = l.replaceAll("&#232;", "è"); l = l.replaceAll("&#233;", "é"); l = l.replaceAll("&#234;", "ê"); l = l.replaceAll("&#235;", "ë"); l = l.replaceAll("&#236;", "ì"); l = l.replaceAll("&#237;", "í"); l = l.replaceAll("&#238;", "î"); l = l.replaceAll("&#239;", "ï"); l = l.replaceAll("&#240;", "ð"); l = l.replaceAll("&#241;", "ñ"); l = l.replaceAll("&#242;", "ò"); l = l.replaceAll("&#243;", "ó"); l = l.replaceAll("&#244;", "ô"); l = l.replaceAll("&#245;", "õ"); l = l.replaceAll("&#246;", "ö"); l = l.replaceAll("&#247;", "÷"); l = l.replaceAll("&#248;", "ø"); l = l.replaceAll("&#249;", "ù"); l = l.replaceAll("&#250;", "ú"); l = l.replaceAll("&#251;", "û"); l = l.replaceAll("&#252;", "ü"); l = l.replaceAll("&#253;", "ý"); l = l.replaceAll("&#254;", "þ"); l = l.replaceAll("&#255;", "ÿ"); l = l.replaceAll("&#34;", "\""); l = l.replaceAll("&#38;", "&"); l = l.replaceAll("&#60;", "<"); l = l.replaceAll("&#62;", ">"); l = l.replaceAll("&#039;", "\'"); l = l.replaceAll("&#8482;", "eur"); l = l.replaceAll("&#8730;", "/"); l = l.replaceAll("&#\\d{2,5};", " "); } // end if (l.contains("&#") && l.contains(";")) if (l.contains("&") && l.contains(";")) { l = l.replaceAll("&nbsp;", " "); l = l.replaceAll("&iexcl;", "¡"); l = l.replaceAll("&cent;", "cent"); l = l.replaceAll("&pound;", "gbf"); l = l.replaceAll("&yen;", "yen"); l = l.replaceAll("&euro;", "eur"); l = l.replaceAll("&copy;", "copyright"); l = l.replaceAll("&laquo;", "<<"); l = l.replaceAll("&reg;", "reg"); l = l.replaceAll("&deg;", "°"); l = l.replaceAll("&plusmn;", "+/-"); l = l.replaceAll("&permill;", "promille"); l = l.replaceAll("&micro;", "micro"); l = l.replaceAll("&radic;", "wurzel"); l = l.replaceAll("&ordm;", "º"); l = l.replaceAll("&raquo;", ">>"); l = l.replaceAll("&frac14;", "1/4"); l = l.replaceAll("&frac12;", "1/2"); l = l.replaceAll("&frac34;", "3/4"); l = l.replaceAll("&iquest;", "¿"); l = l.replaceAll("&Agrave;", "À"); l = l.replaceAll("&Aacute;", "Á"); l = 
l.replaceAll("&Acirc;", "Â"); l = l.replaceAll("&Atilde;", "Ã"); l = l.replaceAll("&Auml;", "Ä"); l = l.replaceAll("&Aring;", "Å"); l = l.replaceAll("&AElig;", "Æ"); l = l.replaceAll("&Ccedil;", "Ç"); l = l.replaceAll("&Egrave;", "È"); l = l.replaceAll("&Eacute;", "É"); l = l.replaceAll("&Ecirc;", "Ê"); l = l.replaceAll("&Euml;", "Ë"); l = l.replaceAll("&Igrave;", "Ì"); l = l.replaceAll("&Iacute;", "Í"); l = l.replaceAll("&Icirc;", "Î"); l = l.replaceAll("&Iuml;", "Ï"); l = l.replaceAll("&ETH;", "Ð"); l = l.replaceAll("&Ntilde;", "Ñ"); l = l.replaceAll("&Ograve;", "Ò"); l = l.replaceAll("&Oacute;", "Ó"); l = l.replaceAll("&Ocirc;", "Ô"); l = l.replaceAll("&Otilde;", "Õ"); l = l.replaceAll("&Ouml;", "Ö"); l = l.replaceAll("&OElig;", "Œ"); l = l.replaceAll("&times;", "x"); l = l.replaceAll("&Oslash;", "Ø"); l = l.replaceAll("&Ugrave;", "Ù"); l = l.replaceAll("&Uacute;", "Ú"); l = l.replaceAll("&Ucirc;", "Û"); l = l.replaceAll("&Uuml;", "Ü"); l = l.replaceAll("&Yacute;", "Ý"); l = l.replaceAll("&THORN;", "Þ"); l = l.replaceAll("&szlig;", "ß"); l = l.replaceAll("&agrave;", "à"); l = l.replaceAll("&aacute;", "á"); l = l.replaceAll("&acirc;", "â"); l = l.replaceAll("&atilde;", "ã"); l = l.replaceAll("&auml;", "ä"); l = l.replaceAll("&aring;", "å"); l = l.replaceAll("&aelig;", "æ"); l = l.replaceAll("&ccedil;", "ç"); l = l.replaceAll("&egrave;", "è"); l = l.replaceAll("&eacute;", "é"); l = l.replaceAll("&ecirc;", "ê"); l = l.replaceAll("&euml;", "ë"); l = l.replaceAll("&igrave;", "ì"); l = l.replaceAll("&iacute;", "í"); l = l.replaceAll("&icirc;", "î"); l = l.replaceAll("&iuml;", "ï"); l = l.replaceAll("&eth;", "ð"); l = l.replaceAll("&ntilde;", "ñ"); l = l.replaceAll("&ograve;", "ò"); l = l.replaceAll("&oacute;", "ó"); l = l.replaceAll("&ocirc;", "ô"); l = l.replaceAll("&otilde;", "õ"); l = l.replaceAll("&ouml;", "ö"); l = l.replaceAll("&oelig;", "œ"); l = l.replaceAll("&divide;", "/"); l = l.replaceAll("&oslash;", "ø"); l = l.replaceAll("&ugrave;", "ù"); l = 
l.replaceAll("&uacute;", "ú"); l = l.replaceAll("&ucirc;", "û"); l = l.replaceAll("&uuml;", "ü"); l = l.replaceAll("&yacute;", "ý"); l = l.replaceAll("&thorn;", "þ"); l = l.replaceAll("&yuml;", "ÿ"); l = l.replaceAll("&yuml;", "ÿ"); l = l.replaceAll("&quot;", "\""); l = l.replaceAll("&amp;", "&"); l = l.replaceAll("&lt;", "<"); l = l.replaceAll("&gt;", ">"); l = l.replaceAll("&\\w{3,8};", " "); } // end if (l.contains("&") && l.contains(";")) if ((l.contains("<u+00") || l.contains("<U+00")) && l.contains(">")) { l = l.replaceAll("<[uU]+0021>", " !"); l = l.replaceAll("<[uU]+0022>", " \""); l = l.replaceAll("<[uU]+0023>", " #"); l = l.replaceAll("<[uU]+0024>", " usd"); l = l.replaceAll("<[uU]+0025>", " %"); l = l.replaceAll("<[uU]+0026>", " &"); l = l.replaceAll("<[uU]+0027>", " \'"); l = l.replaceAll("<[uU]+0028>", " ("); l = l.replaceAll("<[uU]+0029>", " )"); l = l.replaceAll("<[uU]+002[aA]>", " *"); l = l.replaceAll("<[uU]+002[bB]>", " +"); l = l.replaceAll("<[uU]+002[cC]>", " ,"); l = l.replaceAll("<[uU]+002[dD]>", " -"); l = l.replaceAll("<[uU]+002[eE]>", " ."); l = l.replaceAll("<[uU]+002[fF]>", " /"); l = l.replaceAll("<[uU]+0030>", " 0"); l = l.replaceAll("<[uU]+0031>", " 1"); l = l.replaceAll("<[uU]+0032>", " 2"); l = l.replaceAll("<[uU]+0033>", " 3"); l = l.replaceAll("<[uU]+0034>", " 4"); l = l.replaceAll("<[uU]+0035>", " 5"); l = l.replaceAll("<[uU]+0036>", " 6"); l = l.replaceAll("<[uU]+0037>", " 7"); l = l.replaceAll("<[uU]+0038>", " 8"); l = l.replaceAll("<[uU]+0039>", " 9"); l = l.replaceAll("<[uU]+003[aA]>", " :"); l = l.replaceAll("<[uU]+003[bB]>", " ;"); l = l.replaceAll("<[uU]+003[cC]>", " <"); l = l.replaceAll("<[uU]+003[dD]>", " ="); l = l.replaceAll("<[uU]+003[eE]>", " >"); l = l.replaceAll("<[uU]+003[fF]>", " ?"); l = l.replaceAll("<[uU]+0040>", " @"); l = l.replaceAll("<[uU]+0041>", " A"); l = l.replaceAll("<[uU]+0042>", " B"); l = l.replaceAll("<[uU]+0043>", " C"); l = l.replaceAll("<[uU]+0044>", " D"); l = l.replaceAll("<[uU]+0045>", " 
E"); l = l.replaceAll("<[uU]+0046>", " F"); l = l.replaceAll("<[uU]+0047>", " G"); l = l.replaceAll("<[uU]+0048>", " H"); l = l.replaceAll("<[uU]+0049>", " I"); l = l.replaceAll("<[uU]+004[aA]>", " J"); l = l.replaceAll("<[uU]+004[bB]>", " K"); l = l.replaceAll("<[uU]+004[cC]>", " L"); l = l.replaceAll("<[uU]+004[dD]>", " M"); l = l.replaceAll("<[uU]+004[eE]>", " N"); l = l.replaceAll("<[uU]+004[fF]>", " O"); l = l.replaceAll("<[uU]+0050>", " P"); l = l.replaceAll("<[uU]+0051>", " Q"); l = l.replaceAll("<[uU]+0052>", " R"); l = l.replaceAll("<[uU]+0053>", " S"); l = l.replaceAll("<[uU]+0054>", " T"); l = l.replaceAll("<[uU]+0055>", " U"); l = l.replaceAll("<[uU]+0056>", " V"); l = l.replaceAll("<[uU]+0057>", " W"); l = l.replaceAll("<[uU]+0058>", " X"); l = l.replaceAll("<[uU]+0059>", " Y"); l = l.replaceAll("<[uU]+005[aA]>", " Z"); l = l.replaceAll("<[uU]+005[bB]>", " ("); l = l.replaceAll("<[uU]+005[cC]>", " \\"); l = l.replaceAll("<[uU]+005[dD]>", " )"); l = l.replaceAll("<[uU]+005[eE]>", " ^"); l = l.replaceAll("<[uU]+005[fF]>", " _"); l = l.replaceAll("<[uU]+0061>", " a"); l = l.replaceAll("<[uU]+0062>", " b"); l = l.replaceAll("<[uU]+0063>", " c"); l = l.replaceAll("<[uU]+0064>", " d"); l = l.replaceAll("<[uU]+0065>", " e"); l = l.replaceAll("<[uU]+0066>", " f"); l = l.replaceAll("<[uU]+0067>", " g"); l = l.replaceAll("<[uU]+0068>", " h"); l = l.replaceAll("<[uU]+0069>", " i"); l = l.replaceAll("<[uU]+006[aA]>", " j"); l = l.replaceAll("<[uU]+006[bB]>", " k"); l = l.replaceAll("<[uU]+006[cC]>", " l"); l = l.replaceAll("<[uU]+006[dD]>", " m"); l = l.replaceAll("<[uU]+006[eE]>", " n"); l = l.replaceAll("<[uU]+006[fF]>", " o"); l = l.replaceAll("<[uU]+0070>", " p"); l = l.replaceAll("<[uU]+0071>", " q"); l = l.replaceAll("<[uU]+0072>", " r"); l = l.replaceAll("<[uU]+0073>", " s"); l = l.replaceAll("<[uU]+0074>", " t"); l = l.replaceAll("<[uU]+0075>", " u"); l = l.replaceAll("<[uU]+0076>", " v"); l = l.replaceAll("<[uU]+0077>", " w"); l = 
l.replaceAll("<[uU]+0078>", " x"); l = l.replaceAll("<[uU]+0079>", " y"); l = l.replaceAll("<[uU]+007[aA]>", " z"); l = l.replaceAll("<[uU]+00[aA]1>", " ¡"); l = l.replaceAll("<[uU]+00[aA]2>", " cent"); l = l.replaceAll("<[uU]+00[aA]3>", " gbf"); l = l.replaceAll("<[uU]+00[aA]5>", " yen"); l = l.replaceAll("<[uU]+00[aA]7>", " §"); l = l.replaceAll("<[uU]+00[aA]9>", " copyright"); l = l.replaceAll("<[uU]+00[aA][aA]>", " ª"); l = l.replaceAll("<[uU]+00[aA][bB]>", " <<"); l = l.replaceAll("<[uU]+00[aA][dD]>", " ­"); l = l.replaceAll("<[uU]+00[aA][eE]>", " reg"); l = l.replaceAll("<[uU]+00[bB]0>", " °"); l = l.replaceAll("<[uU]+00[bB]1>", " +/-"); l = l.replaceAll("<[uU]+00[bB]2>", " ^2"); l = l.replaceAll("<[uU]+00[bB]3>", " ^3"); l = l.replaceAll("<[uU]+00[bB]5>", " micro"); l = l.replaceAll("<[uU]+00[bB]7>", " -"); l = l.replaceAll("<[uU]+00[bB]8>", " ,"); l = l.replaceAll("<[uU]+00[bB]9>", " ^1"); l = l.replaceAll("<[uU]+00[bB][aA]>", " º"); l = l.replaceAll("<[uU]+00[bB][bB]>", " >>"); l = l.replaceAll("<[uU]+00[bB][cC]>", " 1/4"); l = l.replaceAll("<[uU]+00[bB][dD]>", " 1/2"); l = l.replaceAll("<[uU]+00[bB][eE]>", " 3/4"); l = l.replaceAll("<[uU]+00[bB][fF]>", " ¿"); l = l.replaceAll("<[uU]+00[cC]0>", " À"); l = l.replaceAll("<[uU]+00[cC]1>", " Á"); l = l.replaceAll("<[uU]+00[cC]2>", " Â"); l = l.replaceAll("<[uU]+00[cC]3>", " Ã"); l = l.replaceAll("<[uU]+00[cC]4>", " Ä"); l = l.replaceAll("<[uU]+00[cC]5>", " Å"); l = l.replaceAll("<[uU]+00[cC]6>", " Æ"); l = l.replaceAll("<[uU]+00[cC]7>", " Ç"); l = l.replaceAll("<[uU]+00[cC]8>", " È"); l = l.replaceAll("<[uU]+00[cC]9>", " É"); l = l.replaceAll("<[uU]+00[cC][aA]>", " Ê"); l = l.replaceAll("<[uU]+00[cC][bB]>", " Ë"); l = l.replaceAll("<[uU]+00[cC][cC]>", " Ì"); l = l.replaceAll("<[uU]+00[cC][dD]>", " Í"); l = l.replaceAll("<[uU]+00[cC][eE]>", " Î"); l = l.replaceAll("<[uU]+00[cC][fF]>", " Ï"); l = l.replaceAll("<[uU]+00[dD]0>", " Ð"); l = l.replaceAll("<[uU]+00[dD]1>", " Ñ"); l = l.replaceAll("<[uU]+00[dD]2>", " 
Ò"); l = l.replaceAll("<[uU]+00[dD]3>", " Ó"); l = l.replaceAll("<[uU]+00[dD]4>", " Ô"); l = l.replaceAll("<[uU]+00[dD]5>", " Õ"); l = l.replaceAll("<[uU]+00[dD]6>", " Ö"); l = l.replaceAll("<[uU]+00[dD]7>", " ×"); l = l.replaceAll("<[uU]+00[dD]8>", " Ø"); l = l.replaceAll("<[uU]+00[dD]9>", " Ù"); l = l.replaceAll("<[uU]+00[dD][aA]>", " Ú"); l = l.replaceAll("<[uU]+00[dD][bB]>", " Û"); l = l.replaceAll("<[uU]+00[dD][cC]>", " Ü"); l = l.replaceAll("<[uU]+00[dD][dD]>", " Ý"); l = l.replaceAll("<[uU]+00[dD][eE]>", " Þ"); l = l.replaceAll("<[uU]+00[dD][fF]>", " ß"); l = l.replaceAll("<[uU]+00[eE]0>", " à"); l = l.replaceAll("<[uU]+00[eE]1>", " á"); l = l.replaceAll("<[uU]+00[eE]2>", " â"); l = l.replaceAll("<[uU]+00[eE]3>", " ã"); l = l.replaceAll("<[uU]+00[eE]4>", " ä"); l = l.replaceAll("<[uU]+00[eE]5>", " å"); l = l.replaceAll("<[uU]+00[eE]6>", " æ"); l = l.replaceAll("<[uU]+00[eE]7>", " ç"); l = l.replaceAll("<[uU]+00[eE]8>", " è"); l = l.replaceAll("<[uU]+00[eE]9>", " é"); l = l.replaceAll("<[uU]+00[eE][aA]>", " ê"); l = l.replaceAll("<[uU]+00[eE][bB]>", " ë"); l = l.replaceAll("<[uU]+00[eE][cC]>", " ì"); l = l.replaceAll("<[uU]+00[eE][dD]>", " í"); l = l.replaceAll("<[uU]+00[eE][eE]>", " î"); l = l.replaceAll("<[uU]+00[eE][fF]>", " ï"); l = l.replaceAll("<[uU]+00[fF]0>", " ð"); l = l.replaceAll("<[uU]+00[fF]1>", " ñ"); l = l.replaceAll("<[uU]+00[fF]2>", " ò"); l = l.replaceAll("<[uU]+00[fF]3>", " ó"); l = l.replaceAll("<[uU]+00[fF]4>", " ô"); l = l.replaceAll("<[uU]+00[fF]5>", " õ"); l = l.replaceAll("<[uU]+00[fF]6>", " ö"); l = l.replaceAll("<[uU]+00[fF]7>", " /"); l = l.replaceAll("<[uU]+00[fF]8>", " ø"); l = l.replaceAll("<[uU]+00[fF]9>", " ù"); l = l.replaceAll("<[uU]+00[fF][aA]>", " ú"); l = l.replaceAll("<[uU]+00[fF][bB]>", " û"); l = l.replaceAll("<[uU]+00[fF][cC]>", " ü"); l = l.replaceAll("<[uU]+00[fF][dD]>", " ý"); l = l.replaceAll("<[uU]+00[fF][eE]>", " þ"); l = l.replaceAll("<[uU]+00[fF][fF]>", " ÿ"); l = l.replaceAll("<[uU]+00\\w{2}>", " "); } // end 
if (l.contains("<[uU]+00") && l.contains(">")) l = l.replaceAll("&copy", "copyright"); }

I found StringEscapeUtils.unescapeHtml4(l), but it doesn't seem to produce the same results, as l.equals() was false.

What function can I use to escape those characters with better performance (performance is very important in this case)?

Answer1:

You are potentially scanning the entire string several hundred times.

Don't.

Process it one character at a time to identify tokens you are interested in, and use a lookup table (HashMap) to convert each token to the desired output.

Answer2:

StringEscapeUtils unescape methods don't support the complete list of HTML4 and HTML5 named character references, nor the complete Unicode character set (only up to U+FFFF).

For HTML unescaping, I would recommend using <strong>Unbescape</strong> [ <a href="http://www.unbescape.org" rel="nofollow">http://www.unbescape.org</a> ], of which I'm the author. All the HTML unescaping operations you are performing can be done with Unbescape as simply as:

final String result = HtmlEscape.unescapeHtml(l);

...and it will perform only one pass on your text, instead of hundreds of them as you are doing with each replaceAll call.

Besides HTML unescaping, however, you are performing strange translations like <U+0025> --> '%', or things like 'Ãœ' --> 'ö'. Are these a requirement of your specific scenario? These are not HTML unescape operations: the <U+0025> ones are not HTML escapes, and the Ãœ ones are basically wrongly-encoded strings, where the problem is not how you write them but how you <em>read</em> them from the input stream (Strings in Java don't have an <em>encoding</em>; instead, the encoding should be set correctly when reading them from or writing them to input/output streams).

Last, note that even among your <em>HTML unescaping</em> operations there are strange and non-standard ones, like '&#[xX]24;' --> 'usd' instead of '&#[xX]24;' --> '$'. Are these made on purpose?

Recommend

  • Error in encoding mysql -> How can I reconvert it to something else?
  • Should I do this with tables or divs?
  • img tag within title attribute
  • str_replace not replacing special chars
  • Converting special characters with htmlspecialchars and htmlentities
  • Behaviour of System.Timer when Interval property changed
  • How to use app.selection[0] for scripts in Adobe InDesign
  • Javascript replace “variables” in HTML code faster
  • Magento - Getting the Details of Multiple Products in a Single XML-RPC call
  • RavenDB OrderByDescending and Take - Incorrect Results
  • Can a structure tag be used before its scope?
  • Sharing classes between Server and Client projects in Silverlight
  • How to select sequential duplicates in SQL Server
  • Safe regexs to clean serialized DOM?
  • Why isn't RAD (Eclipse) able to properly detect WAS server startup?
  • Why are pure reducers so important in redux?
  • Avoid merging master into development branch
  • DDMS files not found: SDK_HOME/tools/traceview : Eclipse
  • How to get a Windows Forms panel as a bitmap with C#?
  • Why fgetc too slow?
  • Performance difference between accessing local and class member variables
  • Quickly find the min and max coordinates of connected component in a large image
  • What is the best Linux distro to work with Ruby on Rails? [closed]
  • Number of nodes returned in MutationRecord.addedNodes nodelist (mutationObserver)
  • Should I optimize around reads or CPU time in Google App Engine
  • Passing a Scala type to a function
  • Does anyone have a Categorized XML Corpus Reader for NLTK?
  • Returning this from a constructor function in JS
  • Visual studio 2015 keystroke with mouse button
  • setContentView() is not enough to switch between layouts?
  • How to merge keras sequential models with same input?
  • Time out Error in send mail
  • XSLT foreach repeating nodes to flat
  • what makes a request a new request in asp.net C#
  • Set focus to first invalid form element in AngularJS
  • Position: fixed nav does not stay fixed
  • Atlas images wrong size on iPad iOS 9
  • NetLogo BehaviorSpace - Measure runs using reporters
  • ActionScript 2 vs ActionScript 3 performance
  • java string with new operator and a literal