001//////////////////////////////////////////////////////////////////////////////// 002// checkstyle: Checks Java source code for adherence to a set of rules. 003// Copyright (C) 2001-2018 the original author or authors. 004// 005// This library is free software; you can redistribute it and/or 006// modify it under the terms of the GNU Lesser General Public 007// License as published by the Free Software Foundation; either 008// version 2.1 of the License, or (at your option) any later version. 009// 010// This library is distributed in the hope that it will be useful, 011// but WITHOUT ANY WARRANTY; without even the implied warranty of 012// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 013// Lesser General Public License for more details. 014// 015// You should have received a copy of the GNU Lesser General Public 016// License along with this library; if not, write to the Free Software 017// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 018//////////////////////////////////////////////////////////////////////////////// 019 020package com.puppycrawl.tools.checkstyle.checks; 021 022import java.util.List; 023import java.util.Map; 024import java.util.regex.Matcher; 025import java.util.regex.Pattern; 026 027import com.puppycrawl.tools.checkstyle.FileStatefulCheck; 028import com.puppycrawl.tools.checkstyle.api.AbstractCheck; 029import com.puppycrawl.tools.checkstyle.api.DetailAST; 030import com.puppycrawl.tools.checkstyle.api.TextBlock; 031import com.puppycrawl.tools.checkstyle.api.TokenTypes; 032import com.puppycrawl.tools.checkstyle.utils.CommonUtil; 033 034/** 035 * <p> 036 * Restrict using <a href = 037 * "https://docs.oracle.com/javase/specs/jls/se8/html/jls-3.html#jls-3.3"> 038 * Unicode escapes</a> (such as <code>\u221e</code>). 039 * It is possible to allow using escapes for 040 * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters"> 041 * non-printable(control) characters</a>. 042 * Also, this check can be configured to allow using escapes 043 * if trail comment is present. By the option it is possible to 044 * allow using escapes if literal contains only them. By the option it 045 * is possible to allow using escapes for space literals. 046 * </p> 047 * <p> 048 * Examples of using Unicode:</p> 049 * <pre> 050 * String unitAbbrev = "μs"; // Best: perfectly clear even without a comment. 051 * String unitAbbrev = "\u03bcs"; // Poor: the reader has no idea what this is. 052 * </pre> 053 * <p> 054 * An example of how to configure the check is: 055 * </p> 056 * <pre> 057 * <module name="AvoidEscapedUnicodeCharacters"/> 058 * </pre> 059 * <p> 060 * An example of non-printable(control) characters. 061 * </p> 062 * <pre> 063 * return '\ufeff' + content; // byte order mark 064 * </pre> 065 * <p> 066 * An example of how to configure the check to allow using escapes 067 * for non-printable(control) characters: 068 * </p> 069 * <pre> 070 * <module name="AvoidEscapedUnicodeCharacters"> 071 * <property name="allowEscapesForControlCharacters" value="true"/> 072 * </module> 073 * </pre> 074 * <p> 075 * Example of using escapes with trail comment: 076 * </p> 077 * <pre> 078 * String unitAbbrev = "\u03bcs"; // Greek letter mu, "s" 079 * </pre> 080 * <p>An example of how to configure the check to allow using escapes 081 * if trail comment is present: 082 * </p> 083 * <pre> 084 * <module name="AvoidEscapedUnicodeCharacters"> 085 * <property name="allowByTailComment" value="true"/> 086 * </module> 087 * </pre> 088 * <p>Example of using escapes if literal contains only them: 089 * </p> 090 * <pre> 091 * String unitAbbrev = "\u03bc\u03bc\u03bc"; 092 * </pre> 093 * <p>An example of how to configure the check to allow escapes 094 * if literal contains only them: 095 * </p> 096 * <pre> 097 * <module name="AvoidEscapedUnicodeCharacters"> 098 * <property name="allowIfAllCharactersEscaped" value="true"/> 099 * </module> 100 * </pre> 101 * <p>An example of how to configure the check to allow non-printable escapes: 102 * </p> 103 * <pre> 104 * <module name="AvoidEscapedUnicodeCharacters"> 105 * <property name="allowNonPrintableEscapes" value="true"/> 106 * </module> 107 * </pre> 108 * 109 * @noinspection HtmlTagCanBeJavadocTag 110 */ 111@FileStatefulCheck 112public class AvoidEscapedUnicodeCharactersCheck 113 extends AbstractCheck { 114 115 /** 116 * A key is pointing to the warning message text in "messages.properties" 117 * file. 118 */ 119 public static final String MSG_KEY = "forbid.escaped.unicode.char"; 120 121 /** Regular expression for Unicode chars. */ 122 private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u[a-fA-F0-9]{4}"); 123 124 /** 125 * Regular expression Unicode control characters. 126 * 127 * @see <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters"> 128 * Appendix:Control characters</a> 129 */ 130 private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\[uU]" 131 + "(00[0-1][0-9A-Fa-f]" 132 + "|00[8-9][0-9A-Fa-f]" 133 + "|00[aA][dD]" 134 + "|034[fF]" 135 + "|070[fF]" 136 + "|180[eE]" 137 + "|200[b-fB-F]" 138 + "|202[a-eA-E]" 139 + "|206[0-4a-fA-F]" 140 + "|[fF]{3}[9a-bA-B]" 141 + "|[fF][eE][fF]{2})"); 142 143 /** Regular expression for all escaped chars. */ 144 private static final Pattern ALL_ESCAPED_CHARS = Pattern.compile("^((\\\\u)[a-fA-F0-9]{4}" 145 + "|\"" 146 + "|\'" 147 + "|\\\\" 148 + "|\\\\b" 149 + "|\\\\f" 150 + "|\\\\n" 151 + "|\\\\r" 152 + "|\\\\t" 153 + ")+$"); 154 155 /** Regular expression for escaped backslash. */ 156 private static final Pattern ESCAPED_BACKSLASH = Pattern.compile("\\\\\\\\"); 157 158 /** Regular expression for non-printable unicode chars. */ 159 private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u0000" 160 + "|\\\\u0009" 161 + "|\\\\u000[bB]" 162 + "|\\\\u000[cC]" 163 + "|\\\\u0020" 164 + "|\\\\u007[fF]" 165 + "|\\\\u0085" 166 + "|\\\\u009[fF]" 167 + "|\\\\u00[aA]0" 168 + "|\\\\u00[aA][dD]" 169 + "|\\\\u04[fF]9" 170 + "|\\\\u05[bB][eE]" 171 + "|\\\\u05[dD]0" 172 + "|\\\\u05[eE][aA]" 173 + "|\\\\u05[fF]3" 174 + "|\\\\u05[fF]4" 175 + "|\\\\u0600" 176 + "|\\\\u0604" 177 + "|\\\\u061[cC]" 178 + "|\\\\u06[dD]{2}" 179 + "|\\\\u06[fF]{2}" 180 + "|\\\\u070[fF]" 181 + "|\\\\u0750" 182 + "|\\\\u077[fF]" 183 + "|\\\\u0[eE]00" 184 + "|\\\\u0[eE]7[fF]" 185 + "|\\\\u1680" 186 + "|\\\\u180[eE]" 187 + "|\\\\u1[eE]00" 188 + "|\\\\u2000" 189 + "|\\\\u2001" 190 + "|\\\\u2002" 191 + "|\\\\u2003" 192 + "|\\\\u2004" 193 + "|\\\\u2005" 194 + "|\\\\u2006" 195 + "|\\\\u2007" 196 + "|\\\\u2008" 197 + "|\\\\u2009" 198 + "|\\\\u200[aA]" 199 + "|\\\\u200[fF]" 200 + "|\\\\u2025" 201 + "|\\\\u2028" 202 + "|\\\\u2029" 203 + "|\\\\u202[fF]" 204 + "|\\\\u205[fF]" 205 + "|\\\\u2064" 206 + "|\\\\u2066" 207 + "|\\\\u2067" 208 + "|\\\\u2068" 209 + "|\\\\u2069" 210 + "|\\\\u206[aA]" 211 + "|\\\\u206[fF]" 212 + "|\\\\u20[aA][fF]" 213 + "|\\\\u2100" 214 + "|\\\\u213[aA]" 215 + "|\\\\u3000" 216 + "|\\\\u[dD]800" 217 + "|\\\\u[fF]8[fF]{2}" 218 + "|\\\\u[fF][bB]50" 219 + "|\\\\u[fF][dD][fF]{2}" 220 + "|\\\\u[fF][eE]70" 221 + "|\\\\u[fF][eE][fF]{2}" 222 + "|\\\\u[fF]{2}0[eE]" 223 + "|\\\\u[fF]{2}61" 224 + "|\\\\u[fF]{2}[dD][cC]" 225 + "|\\\\u[fF]{3}9" 226 + "|\\\\u[fF]{3}[aA]" 227 + "|\\\\u[fF]{3}[bB]" 228 + "|\\\\u[fF]{4}"); 229 230 /** Cpp style comments. */ 231 private Map<Integer, TextBlock> singlelineComments; 232 /** C style comments. */ 233 private Map<Integer, List<TextBlock>> blockComments; 234 235 /** Allow use escapes for non-printable(control) characters. */ 236 private boolean allowEscapesForControlCharacters; 237 238 /** Allow use escapes if trail comment is present. */ 239 private boolean allowByTailComment; 240 241 /** Allow if all characters in literal are escaped. */ 242 private boolean allowIfAllCharactersEscaped; 243 244 /** Allow escapes for space literals. */ 245 private boolean allowNonPrintableEscapes; 246 247 /** 248 * Set allowIfAllCharactersEscaped. 249 * @param allow user's value. 250 */ 251 public final void setAllowEscapesForControlCharacters(boolean allow) { 252 allowEscapesForControlCharacters = allow; 253 } 254 255 /** 256 * Set allowByTailComment. 257 * @param allow user's value. 258 */ 259 public final void setAllowByTailComment(boolean allow) { 260 allowByTailComment = allow; 261 } 262 263 /** 264 * Set allowIfAllCharactersEscaped. 265 * @param allow user's value. 266 */ 267 public final void setAllowIfAllCharactersEscaped(boolean allow) { 268 allowIfAllCharactersEscaped = allow; 269 } 270 271 /** 272 * Set allowSpaceEscapes. 273 * @param allow user's value. 274 */ 275 public final void setAllowNonPrintableEscapes(boolean allow) { 276 allowNonPrintableEscapes = allow; 277 } 278 279 @Override 280 public int[] getDefaultTokens() { 281 return getRequiredTokens(); 282 } 283 284 @Override 285 public int[] getAcceptableTokens() { 286 return getRequiredTokens(); 287 } 288 289 @Override 290 public int[] getRequiredTokens() { 291 return new int[] {TokenTypes.STRING_LITERAL, TokenTypes.CHAR_LITERAL}; 292 } 293 294 @Override 295 public void beginTree(DetailAST rootAST) { 296 singlelineComments = getFileContents().getSingleLineComments(); 297 blockComments = getFileContents().getBlockComments(); 298 } 299 300 @Override 301 public void visitToken(DetailAST ast) { 302 final String literal = ast.getText(); 303 304 if (hasUnicodeChar(literal) && !(allowByTailComment && hasTrailComment(ast) 305 || isAllCharactersEscaped(literal) 306 || allowEscapesForControlCharacters 307 && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL) 308 || allowNonPrintableEscapes 309 && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS))) { 310 log(ast.getLineNo(), MSG_KEY); 311 } 312 } 313 314 /** 315 * Checks if literal has Unicode chars. 316 * @param literal String literal. 317 * @return true if literal has Unicode chars. 318 */ 319 private static boolean hasUnicodeChar(String literal) { 320 final String literalWithoutEscapedBackslashes = 321 ESCAPED_BACKSLASH.matcher(literal).replaceAll(""); 322 return UNICODE_REGEXP.matcher(literalWithoutEscapedBackslashes).find(); 323 } 324 325 /** 326 * Check if String literal contains Unicode control chars. 327 * @param literal String literal. 328 * @param pattern RegExp for valid characters. 329 * @return true, if String literal contains Unicode control chars. 330 */ 331 private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) { 332 final int unicodeMatchesCounter = 333 countMatches(UNICODE_REGEXP, literal); 334 final int unicodeValidMatchesCounter = 335 countMatches(pattern, literal); 336 return unicodeMatchesCounter - unicodeValidMatchesCounter == 0; 337 } 338 339 /** 340 * Check if trail comment is present after ast token. 341 * @param ast current token. 342 * @return true if trail comment is present after ast token. 343 */ 344 private boolean hasTrailComment(DetailAST ast) { 345 boolean result = false; 346 final int lineNo = ast.getLineNo(); 347 if (singlelineComments.containsKey(lineNo)) { 348 result = true; 349 } 350 else { 351 final List<TextBlock> commentList = blockComments.get(lineNo); 352 if (commentList != null) { 353 final TextBlock comment = commentList.get(commentList.size() - 1); 354 final String line = getLines()[lineNo - 1]; 355 result = isTrailingBlockComment(comment, line); 356 } 357 } 358 return result; 359 } 360 361 /** 362 * Whether the C style comment is trailing. 363 * @param comment the comment to check. 364 * @param line the line where the comment starts. 365 * @return true if the comment is trailing. 366 */ 367 private static boolean isTrailingBlockComment(TextBlock comment, String line) { 368 return comment.getText().length != 1 369 || CommonUtil.isBlank(line.substring(comment.getEndColNo() + 1)); 370 } 371 372 /** 373 * Count regexp matches into String literal. 374 * @param pattern pattern. 375 * @param target String literal. 376 * @return count of regexp matches. 377 */ 378 private static int countMatches(Pattern pattern, String target) { 379 int matcherCounter = 0; 380 final Matcher matcher = pattern.matcher(target); 381 while (matcher.find()) { 382 matcherCounter++; 383 } 384 return matcherCounter; 385 } 386 387 /** 388 * Checks if all characters in String literal is escaped. 389 * @param literal current literal. 390 * @return true if all characters in String literal is escaped. 391 */ 392 private boolean isAllCharactersEscaped(String literal) { 393 return allowIfAllCharactersEscaped 394 && ALL_ESCAPED_CHARS.matcher(literal.substring(1, 395 literal.length() - 1)).find(); 396 } 397 398}