001/* 002 Licensed to the Apache Software Foundation (ASF) under one 003 or more contributor license agreements. See the NOTICE file 004 distributed with this work for additional information 005 regarding copyright ownership. The ASF licenses this file 006 to you under the Apache License, Version 2.0 (the 007 "License"); you may not use this file except in compliance 008 with the License. You may obtain a copy of the License at 009 010 http://www.apache.org/licenses/LICENSE-2.0 011 012 Unless required by applicable law or agreed to in writing, 013 software distributed under the License is distributed on an 014 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 KIND, either express or implied. See the License for the 016 specific language governing permissions and limitations 017 under the License. 018 */ 019package org.apache.wiki.filters; 020 021import net.sf.akismet.Akismet; 022import org.apache.commons.lang3.time.StopWatch; 023import org.apache.log4j.Logger; 024import org.apache.oro.text.regex.MalformedPatternException; 025import org.apache.oro.text.regex.MatchResult; 026import org.apache.oro.text.regex.Pattern; 027import org.apache.oro.text.regex.PatternCompiler; 028import org.apache.oro.text.regex.PatternMatcher; 029import org.apache.oro.text.regex.Perl5Compiler; 030import org.apache.oro.text.regex.Perl5Matcher; 031import org.apache.wiki.InternalWikiException; 032import org.apache.wiki.api.core.Attachment; 033import org.apache.wiki.api.core.Context; 034import org.apache.wiki.api.core.ContextEnum; 035import org.apache.wiki.api.core.Engine; 036import org.apache.wiki.api.core.Page; 037import org.apache.wiki.api.exceptions.ProviderException; 038import org.apache.wiki.api.exceptions.RedirectException; 039import org.apache.wiki.api.filters.BasePageFilter; 040import org.apache.wiki.api.providers.WikiProvider; 041import org.apache.wiki.attachment.AttachmentManager; 042import org.apache.wiki.auth.user.UserProfile; 043import org.apache.wiki.pages.PageManager; 
044import org.apache.wiki.ui.EditorManager; 045import org.apache.wiki.util.FileUtil; 046import org.apache.wiki.util.HttpUtil; 047import org.apache.wiki.util.TextUtil; 048import org.suigeneris.jrcs.diff.Diff; 049import org.suigeneris.jrcs.diff.DifferentiationFailedException; 050import org.suigeneris.jrcs.diff.Revision; 051import org.suigeneris.jrcs.diff.delta.AddDelta; 052import org.suigeneris.jrcs.diff.delta.ChangeDelta; 053import org.suigeneris.jrcs.diff.delta.DeleteDelta; 054import org.suigeneris.jrcs.diff.delta.Delta; 055import org.suigeneris.jrcs.diff.myers.MyersDiff; 056 057import javax.servlet.http.HttpServletRequest; 058import javax.servlet.http.HttpServletResponse; 059import javax.servlet.jsp.PageContext; 060import java.io.BufferedReader; 061import java.io.IOException; 062import java.io.InputStream; 063import java.io.InputStreamReader; 064import java.io.StringReader; 065import java.io.StringWriter; 066import java.nio.charset.StandardCharsets; 067import java.util.ArrayList; 068import java.util.Collection; 069import java.util.Date; 070import java.util.Iterator; 071import java.util.Properties; 072import java.util.Random; 073import java.util.StringTokenizer; 074import java.util.Vector; 075import java.util.concurrent.ThreadLocalRandom; 076 077 078/** 079 * This is Herb, the JSPWiki spamfilter that can also do choke modifications. 080 * 081 * Parameters: 082 * <ul> 083 * <li>wordlist - Page name where the spamword regexps are found. Use [{SET spamwords='regexp list separated with spaces'}] on 084 * that page. Default is "SpamFilterWordList". 085 * <li>IPlist - Page name where the IP regexps are found. Use [{SET ips='regexp list separated with spaces'}] on 086 * that page. Default is "SpamFilterIPList". 087 * <li>maxpagenamelength - Maximum page name length. Default is 100. 088 * <li>blacklist - The name of an attachment containing the list of spam patterns, one per line. 
Default is
 *    "SpamFilterWordList/blacklist.txt"</li>
 *  <li>errorpage - The page to which the user is redirected. Has a special variable $msg which states the reason. Default is "RejectedMessage".</li>
 *  <li>pagechangesinminute - How many page changes are allowed per minute. Default is 5.</li>
 *  <li>similarchanges - How many similar page changes are allowed before the host is banned. Default is 2. (since 2.4.72)</li>
 *  <li>bantime - How long an IP address stays on the temporary ban list, in minutes. Default is 60.</li>
 *  <li>maxurls - How many URLs can be added to the page before it is considered spam. Default is 10.</li>
 *  <li>akismet-apikey - The Akismet API key (see akismet.org)</li>
 *  <li>ignoreauthenticated - If set to "true", all authenticated users are ignored and never caught in SpamFilter</li>
 *  <li>captcha - Sets the captcha technology to use. Current allowed values are "none" and "asirra".</li>
 *  <li>strategy - Sets the filtering strategy to use. If set to "eager", will stop at the first probable
 *      match, and won't consider any other tests. This is the default, as it's considerably lighter. If set to "score", will go through all of the tests
 *      and calculate a score for the spam, which is then compared to a filter level value.</li>
101 * </ul> 102 * 103 * <p>Please see the default editors/plain.jsp for examples on how the SpamFilter integrates 104 * with the editor system.</p> 105 * 106 * <p>Changes by admin users are ignored in any case.</p> 107 * 108 * @since 2.1.112 109 */ 110public class SpamFilter extends BasePageFilter { 111 112 private static final String ATTR_SPAMFILTER_SCORE = "spamfilter.score"; 113 private static final String REASON_REGEXP = "Regexp"; 114 private static final String REASON_IP_BANNED_TEMPORARILY = "IPBannedTemporarily"; 115 private static final String REASON_IP_BANNED_PERMANENTLY = "IPBannedPermanently"; 116 private static final String REASON_BOT_TRAP = "BotTrap"; 117 private static final String REASON_AKISMET = "Akismet"; 118 private static final String REASON_TOO_MANY_URLS = "TooManyUrls"; 119 private static final String REASON_SIMILAR_MODIFICATIONS = "SimilarModifications"; 120 private static final String REASON_TOO_MANY_MODIFICATIONS = "TooManyModifications"; 121 private static final String REASON_PAGENAME_TOO_LONG = "PageNameTooLong"; 122 private static final String REASON_UTF8_TRAP = "UTF8Trap"; 123 124 private static final String LISTVAR = "spamwords"; 125 private static final String LISTIPVAR = "ips"; 126 127 private static final Random RANDOM = ThreadLocalRandom.current(); 128 129 /** The filter property name for specifying the page which contains the list of spamwords. Value is <tt>{@value}</tt>. */ 130 public static final String PROP_WORDLIST = "wordlist"; 131 132 /** The filter property name for specifying the page which contains the list of IPs to ban. Value is <tt>{@value}</tt>. */ 133 public static final String PROP_IPLIST = "IPlist"; 134 135 /** The filter property name for specifying the maximum page name length. Value is <tt>{@value}</tt>. */ 136 public static final String PROP_MAX_PAGENAME_LENGTH = "maxpagenamelength"; 137 138 /** The filter property name for the page to which you are directed if Herb rejects your edit. Value is <tt>{@value}</tt>. 
*/ 139 public static final String PROP_ERRORPAGE = "errorpage"; 140 141 /** The filter property name for specifying how many changes is any given IP address 142 * allowed to do per minute. Value is <tt>{@value}</tt>. 143 */ 144 public static final String PROP_PAGECHANGES = "pagechangesinminute"; 145 146 /** The filter property name for specifying how many similar changes are allowed before a host is banned. Value is <tt>{@value}</tt>. */ 147 public static final String PROP_SIMILARCHANGES = "similarchanges"; 148 149 /** The filter property name for specifying how long a host is banned. Value is <tt>{@value}</tt>.*/ 150 public static final String PROP_BANTIME = "bantime"; 151 152 /** The filter property name for the attachment containing the blacklist. Value is <tt>{@value}</tt>.*/ 153 public static final String PROP_BLACKLIST = "blacklist"; 154 155 /** The filter property name for specifying how many URLs can any given edit contain. Value is <tt>{@value}</tt> */ 156 public static final String PROP_MAXURLS = "maxurls"; 157 158 /** The filter property name for specifying the Akismet API-key. Value is <tt>{@value}</tt>. */ 159 public static final String PROP_AKISMET_API_KEY = "akismet-apikey"; 160 161 /** The filter property name for specifying whether authenticated users should be ignored. Value is <tt>{@value}</tt>. */ 162 public static final String PROP_IGNORE_AUTHENTICATED = "ignoreauthenticated"; 163 164 /** The filter property name for specifying which captcha technology should be used. Value is <tt>{@value}</tt>. */ 165 public static final String PROP_CAPTCHA = "captcha"; 166 167 /** The filter property name for specifying which filter strategy should be used. Value is <tt>{@value}</tt>. */ 168 public static final String PROP_FILTERSTRATEGY = "strategy"; 169 170 /** The string specifying the "eager" strategy. Value is <tt>{@value}</tt>. */ 171 public static final String STRATEGY_EAGER = "eager"; 172 173 /** The string specifying the "score" strategy. 
Value is <tt>{@value}</tt>. */ 174 public static final String STRATEGY_SCORE = "score"; 175 176 private static final String URL_REGEXP = "(http://|https://|mailto:)([A-Za-z0-9_/\\.\\+\\?\\#\\-\\@=&;]+)"; 177 178 private String m_forbiddenWordsPage = "SpamFilterWordList"; 179 private String m_forbiddenIPsPage = "SpamFilterIPList"; 180 private String m_pageNameMaxLength = "100"; 181 private String m_errorPage = "RejectedMessage"; 182 private String m_blacklist = "SpamFilterWordList/blacklist.txt"; 183 184 private PatternMatcher m_matcher = new Perl5Matcher(); 185 private PatternCompiler m_compiler = new Perl5Compiler(); 186 187 private Collection<Pattern> m_spamPatterns = null; 188 private Collection<Pattern> m_IPPatterns = null; 189 190 private Date m_lastRebuild = new Date( 0L ); 191 192 private static Logger c_spamlog = Logger.getLogger( "SpamLog" ); 193 private static Logger log = Logger.getLogger( SpamFilter.class ); 194 195 private Vector<Host> m_temporaryBanList = new Vector<>(); 196 197 private int m_banTime = 60; // minutes 198 199 private Vector<Host> m_lastModifications = new Vector<>(); 200 201 /** How many times a single IP address can change a page per minute? */ 202 private int m_limitSinglePageChanges = 5; 203 204 /** How many times can you add the exact same string to a page? */ 205 private int m_limitSimilarChanges = 2; 206 207 /** How many URLs can be added at maximum. */ 208 private int m_maxUrls = 10; 209 210 private Pattern m_urlPattern; 211 private Akismet m_akismet; 212 213 private String m_akismetAPIKey = null; 214 215 private boolean m_useCaptcha = false; 216 217 /** The limit at which we consider something to be spam. */ 218 private int m_scoreLimit = 1; 219 220 /** If set to true, will ignore anyone who is in Authenticated role. 
*/ 221 private boolean m_ignoreAuthenticated = false; 222 223 private boolean m_stopAtFirstMatch = true; 224 225 private static String c_hashName; 226 private static long c_lastUpdate; 227 228 /** The HASH_DELAY value is a maximum amount of time that an user can keep 229 * a session open, because after the value has expired, we will invent a new 230 * hash field name. By default this is {@value} hours, which should be ample 231 * time for someone. 232 */ 233 private static final long HASH_DELAY = 24; 234 235 236 /** 237 * {@inheritDoc} 238 */ 239 @Override 240 public void initialize( final Engine engine, final Properties properties ) { 241 m_forbiddenWordsPage = properties.getProperty( PROP_WORDLIST, m_forbiddenWordsPage ); 242 m_forbiddenIPsPage = properties.getProperty( PROP_IPLIST, m_forbiddenIPsPage); 243 m_pageNameMaxLength = properties.getProperty( PROP_MAX_PAGENAME_LENGTH, m_pageNameMaxLength); 244 m_errorPage = properties.getProperty( PROP_ERRORPAGE, m_errorPage ); 245 m_limitSinglePageChanges = TextUtil.getIntegerProperty( properties, PROP_PAGECHANGES, m_limitSinglePageChanges ); 246 247 m_limitSimilarChanges = TextUtil.getIntegerProperty( properties, PROP_SIMILARCHANGES, m_limitSimilarChanges ); 248 249 m_maxUrls = TextUtil.getIntegerProperty( properties, PROP_MAXURLS, m_maxUrls ); 250 m_banTime = TextUtil.getIntegerProperty( properties, PROP_BANTIME, m_banTime ); 251 m_blacklist = properties.getProperty( PROP_BLACKLIST, m_blacklist ); 252 253 m_ignoreAuthenticated = TextUtil.getBooleanProperty( properties, PROP_IGNORE_AUTHENTICATED, m_ignoreAuthenticated ); 254 255 m_useCaptcha = properties.getProperty( PROP_CAPTCHA, "" ).equals("asirra"); 256 257 try { 258 m_urlPattern = m_compiler.compile( URL_REGEXP ); 259 } catch( final MalformedPatternException e ) { 260 log.fatal( "Internal error: Someone put in a faulty pattern.", e ); 261 throw new InternalWikiException( "Faulty pattern." 
, e); 262 } 263 264 m_akismetAPIKey = TextUtil.getStringProperty( properties, PROP_AKISMET_API_KEY, m_akismetAPIKey ); 265 m_stopAtFirstMatch = TextUtil.getStringProperty( properties, PROP_FILTERSTRATEGY, STRATEGY_EAGER ).equals( STRATEGY_EAGER ); 266 267 log.info( "# Spam filter initialized. Temporary ban time " + m_banTime + 268 " mins, max page changes/minute: " + m_limitSinglePageChanges ); 269 } 270 271 private static final int REJECT = 0; 272 private static final int ACCEPT = 1; 273 private static final int NOTE = 2; 274 275 private static String log( final Context ctx, final int type, final String source, String message ) { 276 message = TextUtil.replaceString( message, "\r\n", "\\r\\n" ); 277 message = TextUtil.replaceString( message, "\"", "\\\"" ); 278 279 final String uid = getUniqueID(); 280 281 final String page = ctx.getPage().getName(); 282 String reason = "UNKNOWN"; 283 final String addr = ctx.getHttpRequest() != null ? HttpUtil.getRemoteAddress( ctx.getHttpRequest() ) : "-"; 284 285 switch( type ) { 286 case REJECT: 287 reason = "REJECTED"; 288 break; 289 case ACCEPT: 290 reason = "ACCEPTED"; 291 break; 292 case NOTE: 293 reason = "NOTE"; 294 break; 295 default: 296 throw new InternalWikiException( "Illegal type " + type ); 297 } 298 c_spamlog.info( reason + " " + source + " " + uid + " " + addr + " \"" + page + "\" " + message ); 299 300 return uid; 301 } 302 303 /** {@inheritDoc} */ 304 @Override 305 public String preSave( final Context context, final String content ) throws RedirectException { 306 cleanBanList(); 307 refreshBlacklists( context ); 308 final Change change = getChange( context, content ); 309 310 if( !ignoreThisUser( context ) ) { 311 checkBanList( context, change ); 312 checkSinglePageChange( context, content, change ); 313 checkIPList( context ); 314 checkPatternList( context, content, change ); 315 checkPageName( context, content, change); 316 } 317 318 if( !m_stopAtFirstMatch ) { 319 final Integer score = context.getVariable( 
ATTR_SPAMFILTER_SCORE ); 320 321 if( score != null && score.intValue() >= m_scoreLimit ) { 322 throw new RedirectException( "Herb says you got too many points", getRedirectPage( context ) ); 323 } 324 } 325 326 log( context, ACCEPT, "-", change.toString() ); 327 return content; 328 } 329 330 private void checkPageName( final Context context, final String content, final Change change) throws RedirectException { 331 final Page page = context.getPage(); 332 final String pageName = page.getName(); 333 final int maxlength = Integer.valueOf(m_pageNameMaxLength); 334 if ( pageName.length() > maxlength) { 335 // 336 // Spam filter has a match. 337 // 338 339 final String uid = log( context, REJECT, REASON_PAGENAME_TOO_LONG + "(" + m_pageNameMaxLength + ")" , pageName); 340 341 log.info("SPAM:PageNameTooLong (" + uid + "). The length of the page name is too large (" + pageName.length() + " , limit is " + m_pageNameMaxLength + ")"); 342 checkStrategy( context, REASON_PAGENAME_TOO_LONG, "Herb says '" + pageName + "' is a bad pageName and I trust Herb! (Incident code " + uid + ")" ); 343 344 } 345 } 346 347 private void checkStrategy( final Context context, final String error, final String message ) throws RedirectException { 348 if( m_stopAtFirstMatch ) { 349 throw new RedirectException( message, getRedirectPage( context ) ); 350 } 351 352 Integer score = context.getVariable( ATTR_SPAMFILTER_SCORE ); 353 if( score != null ) { 354 score = score + 1; 355 } else { 356 score = 1; 357 } 358 359 context.setVariable( ATTR_SPAMFILTER_SCORE, score ); 360 } 361 362 /** 363 * Parses a list of patterns and returns a Collection of compiled Pattern 364 * objects. 365 * 366 * @param source 367 * @param list 368 * @return A Collection of the Patterns that were found from the lists. 
369 */ 370 private Collection< Pattern > parseWordList( final Page source, final String list ) { 371 final ArrayList< Pattern > compiledpatterns = new ArrayList<>(); 372 373 if( list != null ) { 374 final StringTokenizer tok = new StringTokenizer( list, " \t\n" ); 375 376 while( tok.hasMoreTokens() ) { 377 final String pattern = tok.nextToken(); 378 379 try { 380 compiledpatterns.add( m_compiler.compile( pattern ) ); 381 } catch( final MalformedPatternException e ) { 382 log.debug( "Malformed spam filter pattern " + pattern ); 383 source.setAttribute("error", "Malformed spam filter pattern " + pattern); 384 } 385 } 386 } 387 388 return compiledpatterns; 389 } 390 391 /** 392 * Takes a MT-Blacklist -formatted blacklist and returns a list of compiled Pattern objects. 393 * 394 * @param list 395 * @return The parsed blacklist patterns. 396 */ 397 private Collection< Pattern > parseBlacklist( final String list ) { 398 final ArrayList< Pattern > compiledpatterns = new ArrayList<>(); 399 400 if( list != null ) { 401 try { 402 final BufferedReader in = new BufferedReader( new StringReader(list) ); 403 String line; 404 while( (line = in.readLine() ) != null ) { 405 line = line.trim(); 406 if( line.length() == 0 ) continue; // Empty line 407 if( line.startsWith("#") ) continue; // It's a comment 408 409 int ws = line.indexOf( ' ' ); 410 if( ws == -1 ) ws = line.indexOf( '\t' ); 411 if( ws != -1 ) line = line.substring( 0, ws ); 412 413 try { 414 compiledpatterns.add( m_compiler.compile( line ) ); 415 } catch( final MalformedPatternException e ) { 416 log.debug( "Malformed spam filter pattern " + line ); 417 } 418 } 419 } catch( final IOException e ) { 420 log.info( "Could not read patterns; returning what I got" , e ); 421 } 422 } 423 424 return compiledpatterns; 425 } 426 427 /** 428 * Takes a single page change and performs a load of tests on the content change. 429 * An admin can modify anything. 
430 * 431 * @param context 432 * @param content 433 * @throws RedirectException 434 */ 435 private synchronized void checkSinglePageChange( final Context context, final String content, final Change change ) 436 throws RedirectException { 437 final HttpServletRequest req = context.getHttpRequest(); 438 439 if( req != null ) { 440 final String addr = HttpUtil.getRemoteAddress( req ); 441 int hostCounter = 0; 442 int changeCounter = 0; 443 444 log.debug( "Change is " + change.m_change ); 445 446 final long time = System.currentTimeMillis() - 60*1000L; // 1 minute 447 448 for( final Iterator< Host > i = m_lastModifications.iterator(); i.hasNext(); ) { 449 final Host host = i.next(); 450 451 // Check if this item is invalid 452 if( host.getAddedTime() < time ) { 453 log.debug( "Removed host " + host.getAddress() + " from modification queue (expired)" ); 454 i.remove(); 455 continue; 456 } 457 458 // Check if this IP address has been seen before 459 if( host.getAddress().equals( addr ) ) { 460 hostCounter++; 461 } 462 463 // Check, if this change has been seen before 464 if( host.getChange() != null && host.getChange().equals( change ) ) { 465 changeCounter++; 466 } 467 } 468 469 // Now, let's check against the limits. 470 if( hostCounter >= m_limitSinglePageChanges ) { 471 final Host host = new Host( addr, null ); 472 m_temporaryBanList.add( host ); 473 474 final String uid = log( context, REJECT, REASON_TOO_MANY_MODIFICATIONS, change.m_change ); 475 log.info( "SPAM:TooManyModifications (" + uid + "). Added host " + addr + " to temporary ban list for doing too many modifications/minute" ); 476 checkStrategy( context, REASON_TOO_MANY_MODIFICATIONS, "Herb says you look like a spammer, and I trust Herb! 
(Incident code " + uid + ")" ); 477 } 478 479 if( changeCounter >= m_limitSimilarChanges ) { 480 final Host host = new Host( addr, null ); 481 m_temporaryBanList.add( host ); 482 483 final String uid = log( context, REJECT, REASON_SIMILAR_MODIFICATIONS, change.m_change ); 484 log.info( "SPAM:SimilarModifications (" + uid + "). Added host " + addr + " to temporary ban list for doing too many similar modifications" ); 485 checkStrategy( context, REASON_SIMILAR_MODIFICATIONS, "Herb says you look like a spammer, and I trust Herb! (Incident code "+uid+")"); 486 } 487 488 // Calculate the number of links in the addition. 489 String tstChange = change.toString(); 490 int urlCounter = 0; 491 while( m_matcher.contains( tstChange,m_urlPattern ) ) { 492 final MatchResult m = m_matcher.getMatch(); 493 tstChange = tstChange.substring( m.endOffset(0) ); 494 urlCounter++; 495 } 496 497 if( urlCounter > m_maxUrls ) { 498 final Host host = new Host( addr, null ); 499 m_temporaryBanList.add( host ); 500 501 final String uid = log( context, REJECT, REASON_TOO_MANY_URLS, change.toString() ); 502 log.info( "SPAM:TooManyUrls (" + uid + "). Added host " + addr + " to temporary ban list for adding too many URLs" ); 503 checkStrategy( context, REASON_TOO_MANY_URLS, "Herb says you look like a spammer, and I trust Herb! (Incident code " + uid + ")" ); 504 } 505 506 // Check bot trap 507 checkBotTrap( context, change ); 508 509 // Check UTF-8 mangling 510 checkUTF8( context, change ); 511 512 // Do Akismet check. This is good to be the last, because this is the most expensive operation. 513 checkAkismet( context, change ); 514 515 m_lastModifications.add( new Host( addr, change ) ); 516 } 517 } 518 519 520 /** 521 * Checks against the akismet system. 
522 * 523 * @param context 524 * @param change 525 * @throws RedirectException 526 */ 527 private void checkAkismet( final Context context, final Change change ) throws RedirectException { 528 if( m_akismetAPIKey != null ) { 529 if( m_akismet == null ) { 530 log.info( "Initializing Akismet spam protection." ); 531 m_akismet = new Akismet( m_akismetAPIKey, context.getEngine().getBaseURL() ); 532 533 if( !m_akismet.verifyAPIKey() ) { 534 log.error( "Akismet API key cannot be verified. Please check your config." ); 535 m_akismetAPIKey = null; 536 m_akismet = null; 537 } 538 } 539 540 final HttpServletRequest req = context.getHttpRequest(); 541 542 // Akismet will mark all empty statements as spam, so we'll just ignore them. 543 if( change.m_adds == 0 && change.m_removals > 0 ) { 544 return; 545 } 546 547 if( req != null && m_akismet != null ) { 548 log.debug( "Calling Akismet to check for spam..." ); 549 550 final StopWatch sw = new StopWatch(); 551 sw.start(); 552 553 final String ipAddress = HttpUtil.getRemoteAddress( req ); 554 final String userAgent = req.getHeader( "User-Agent" ); 555 final String referrer = req.getHeader( "Referer"); 556 final String permalink = context.getViewURL( context.getPage().getName() ); 557 final String commentType = context.getRequestContext().equals( ContextEnum.PAGE_COMMENT.getRequestContext() ) ? 
"comment" : "edit"; 558 final String commentAuthor = context.getCurrentUser().getName(); 559 final String commentAuthorEmail = null; 560 final String commentAuthorURL = null; 561 562 final boolean isSpam = m_akismet.commentCheck( ipAddress, 563 userAgent, 564 referrer, 565 permalink, 566 commentType, 567 commentAuthor, 568 commentAuthorEmail, 569 commentAuthorURL, 570 change.toString(), 571 null ); 572 573 sw.stop(); 574 log.debug( "Akismet request done in: " + sw ); 575 576 if( isSpam ) { 577 // Host host = new Host( ipAddress, null ); 578 // m_temporaryBanList.add( host ); 579 580 final String uid = log( context, REJECT, REASON_AKISMET, change.toString() ); 581 log.info( "SPAM:Akismet (" + uid + "). Akismet thinks this change is spam; added host to temporary ban list." ); 582 checkStrategy( context, REASON_AKISMET, "Akismet tells Herb you're a spammer, Herb trusts Akismet, and I trust Herb! (Incident code " + uid + ")" ); 583 } 584 } 585 } 586 } 587 588 /** 589 * Returns a static string which can be used to detect spambots which just wildly fill in all the fields. 590 * 591 * @return A string 592 */ 593 public static String getBotFieldName() { 594 return "submit_auth"; 595 } 596 597 /** 598 * This checks whether an invisible field is available in the request, and whether it's contents are suspected spam. 599 * 600 * @param context 601 * @param change 602 * @throws RedirectException 603 */ 604 private void checkBotTrap( final Context context, final Change change ) throws RedirectException { 605 final HttpServletRequest request = context.getHttpRequest(); 606 if( request != null ) { 607 final String unspam = request.getParameter( getBotFieldName() ); 608 if( unspam != null && unspam.length() > 0 ) { 609 final String uid = log( context, REJECT, REASON_BOT_TRAP, change.toString() ); 610 611 log.info( "SPAM:BotTrap (" + uid + "). Wildly behaving bot detected." ); 612 checkStrategy( context, REASON_BOT_TRAP, "Spamming attempt detected. 
(Incident code " + uid + ")" ); 613 } 614 } 615 } 616 617 private void checkUTF8( final Context context, final Change change ) throws RedirectException { 618 final HttpServletRequest request = context.getHttpRequest(); 619 if( request != null ) { 620 final String utf8field = request.getParameter( "encodingcheck" ); 621 if( utf8field != null && !utf8field.equals( "\u3041" ) ) { 622 final String uid = log( context, REJECT, REASON_UTF8_TRAP, change.toString() ); 623 624 log.info( "SPAM:UTF8Trap (" + uid + "). Wildly posting dumb bot detected." ); 625 checkStrategy( context, REASON_UTF8_TRAP, "Spamming attempt detected. (Incident code " + uid + ")" ); 626 } 627 } 628 } 629 630 /** Goes through the ban list and cleans away any host which has expired from it. */ 631 private synchronized void cleanBanList() { 632 final long now = System.currentTimeMillis(); 633 for( final Iterator< Host > i = m_temporaryBanList.iterator(); i.hasNext(); ) { 634 final Host host = i.next(); 635 636 if( host.getReleaseTime() < now ) { 637 log.debug( "Removed host " + host.getAddress() + " from temporary ban list (expired)" ); 638 i.remove(); 639 } 640 } 641 } 642 643 /** 644 * Checks the ban list if the IP address of the changer is already on it. 645 * 646 * @param context 647 * @throws RedirectException 648 */ 649 private void checkBanList( final Context context, final Change change ) throws RedirectException { 650 final HttpServletRequest req = context.getHttpRequest(); 651 652 if( req != null ) { 653 final String remote = HttpUtil.getRemoteAddress(req); 654 final long now = System.currentTimeMillis(); 655 656 for( final Host host : m_temporaryBanList ) { 657 if( host.getAddress().equals( remote ) ) { 658 final long timeleft = ( host.getReleaseTime() - now ) / 1000L; 659 660 log( context, REJECT, REASON_IP_BANNED_TEMPORARILY, change.m_change ); 661 checkStrategy( context, REASON_IP_BANNED_TEMPORARILY, 662 "You have been temporarily banned from modifying this wiki. 
(" + timeleft + " seconds of ban left)" ); 663 } 664 } 665 } 666 } 667 668 /** 669 * If the spam filter notices changes in the black list page, it will refresh them automatically. 670 * 671 * @param context associated WikiContext 672 */ 673 private void refreshBlacklists( final Context context ) { 674 try { 675 boolean rebuild = false; 676 677 // Rebuild, if the spam words page, the attachment or the IP ban page has changed since. 678 final Page sourceSpam = context.getEngine().getManager( PageManager.class ).getPage( m_forbiddenWordsPage ); 679 if( sourceSpam != null ) { 680 if( m_spamPatterns == null || m_spamPatterns.isEmpty() || sourceSpam.getLastModified().after( m_lastRebuild ) ) { 681 rebuild = true; 682 } 683 } 684 685 final Attachment att = context.getEngine().getManager( AttachmentManager.class ).getAttachmentInfo( context, m_blacklist ); 686 if( att != null ) { 687 if( m_spamPatterns == null || m_spamPatterns.isEmpty() || att.getLastModified().after( m_lastRebuild ) ) { 688 rebuild = true; 689 } 690 } 691 692 final Page sourceIPs = context.getEngine().getManager( PageManager.class ).getPage( m_forbiddenIPsPage ); 693 if( sourceIPs != null ) { 694 if( m_IPPatterns == null || m_IPPatterns.isEmpty() || sourceIPs.getLastModified().after( m_lastRebuild ) ) { 695 rebuild = true; 696 } 697 } 698 699 // Do the actual rebuilding. For simplicity's sake, we always rebuild the complete filter list regardless of what changed. 700 if( rebuild ) { 701 m_lastRebuild = new Date(); 702 m_spamPatterns = parseWordList( sourceSpam, ( sourceSpam != null ) ? sourceSpam.getAttribute( LISTVAR ) : null ); 703 704 log.info( "Spam filter reloaded - recognizing " + m_spamPatterns.size() + " patterns from page " + m_forbiddenWordsPage ); 705 706 m_IPPatterns = parseWordList( sourceIPs, ( sourceIPs != null ) ? 
sourceIPs.getAttribute( LISTIPVAR ) : null ); 707 log.info( "IP filter reloaded - recognizing " + m_IPPatterns.size() + " patterns from page " + m_forbiddenIPsPage ); 708 709 if( att != null ) { 710 final InputStream in = context.getEngine().getManager( AttachmentManager.class ).getAttachmentStream(att); 711 final StringWriter out = new StringWriter(); 712 FileUtil.copyContents( new InputStreamReader( in, StandardCharsets.UTF_8 ), out ); 713 final Collection< Pattern > blackList = parseBlacklist( out.toString() ); 714 log.info( "...recognizing additional " + blackList.size() + " patterns from blacklist " + m_blacklist ); 715 m_spamPatterns.addAll( blackList ); 716 } 717 } 718 } catch( final IOException ex ) { 719 log.info( "Unable to read attachment data, continuing...", ex ); 720 } catch( final ProviderException ex ) { 721 log.info( "Failed to read spam filter attachment, continuing...", ex ); 722 } 723 } 724 725 /** 726 * Does a check against a known pattern list. 727 * 728 * @param context 729 * @param content 730 * @param change 731 * @throws RedirectException 732 */ 733 private void checkPatternList( final Context context, final String content, final Change change ) throws RedirectException { 734 // If we have no spam patterns defined, or we're trying to save the page containing the patterns, just return. 735 if( m_spamPatterns == null || context.getPage().getName().equals( m_forbiddenWordsPage ) ) { 736 return; 737 } 738 739 String ch = change.toString(); 740 if( context.getHttpRequest() != null ) { 741 ch += HttpUtil.getRemoteAddress( context.getHttpRequest() ); 742 } 743 744 for( final Pattern p : m_spamPatterns ) { 745 // log.debug("Attempting to match page contents with "+p.getPattern()); 746 747 if( m_matcher.contains( ch, p ) ) { 748 // Spam filter has a match. 749 final String uid = log( context, REJECT, REASON_REGEXP + "(" + p.getPattern() + ")", ch ); 750 751 log.info( "SPAM:Regexp (" + uid + "). 
Content matches the spam filter '" + p.getPattern() + "'" ); 752 checkStrategy( context, REASON_REGEXP, "Herb says '" + p.getPattern() + "' is a bad spam word and I trust Herb! (Incident code " + uid + ")" ); 753 } 754 } 755 } 756 757 758 /** 759 * Does a check against a pattern list of IPs. 760 * 761 * @param context 762 * @throws RedirectException 763 */ 764 private void checkIPList( final Context context ) throws RedirectException { 765 // If we have no IP patterns defined, or we're trying to save the page containing the IP patterns, just return. 766 if( m_IPPatterns == null || context.getPage().getName().equals( m_forbiddenIPsPage ) ) { 767 return; 768 } 769 770 final String remoteIP = HttpUtil.getRemoteAddress( context.getHttpRequest() ); 771 log.info("Attempting to match remoteIP " + remoteIP + " against " + m_IPPatterns.size() + " patterns"); 772 773 for( final Pattern p : m_IPPatterns ) { 774 log.debug("Attempting to match remoteIP with " + p.getPattern()); 775 776 if( m_matcher.contains( remoteIP, p ) ) { 777 778 // IP filter has a match. 779 // 780 final String uid = log( context, REJECT, REASON_IP_BANNED_PERMANENTLY + "(" + p.getPattern() + ")", remoteIP ); 781 782 log.info( "SPAM:IPBanList (" + uid + "). remoteIP matches the IP filter '" + p.getPattern() + "'" ); 783 checkStrategy( context, REASON_IP_BANNED_PERMANENTLY, "Herb says '" + p.getPattern() + "' is a banned IP and I trust Herb! (Incident code " + uid + ")" ); 784 } 785 } 786 } 787 788 private void checkPatternList( final Context context, final String content, final String change ) throws RedirectException { 789 final Change c = new Change(); 790 c.m_change = change; 791 checkPatternList( context, content, c ); 792 } 793 794 /** 795 * Creates a simple text string describing the added content. 796 * 797 * @param context 798 * @param newText 799 * @return Empty string, if there is no change. 
800 */ 801 private static Change getChange( final Context context, final String newText ) { 802 final Page page = context.getPage(); 803 final StringBuffer change = new StringBuffer(); 804 final Engine engine = context.getEngine(); 805 // Get current page version 806 807 final Change ch = new Change(); 808 809 try { 810 final String oldText = engine.getManager( PageManager.class ).getPureText( page.getName(), WikiProvider.LATEST_VERSION ); 811 final String[] first = Diff.stringToArray( oldText ); 812 final String[] second = Diff.stringToArray( newText ); 813 final Revision rev = Diff.diff( first, second, new MyersDiff() ); 814 815 if( rev == null || rev.size() == 0 ) { 816 return ch; 817 } 818 819 for( int i = 0; i < rev.size(); i++ ) { 820 final Delta d = rev.getDelta( i ); 821 822 if( d instanceof AddDelta ) { 823 d.getRevised().toString( change, "", "\r\n" ); 824 ch.m_adds++; 825 826 } else if( d instanceof ChangeDelta ) { 827 d.getRevised().toString( change, "", "\r\n" ); 828 ch.m_adds++; 829 830 } else if( d instanceof DeleteDelta ) { 831 ch.m_removals++; 832 } 833 } 834 } catch( final DifferentiationFailedException e ) { 835 log.error( "Diff failed", e ); 836 } 837 838 // Don't forget to include the change note, too 839 final String changeNote = page.getAttribute( Page.CHANGENOTE ); 840 if( changeNote != null ) { 841 change.append( "\r\n" ); 842 change.append( changeNote ); 843 } 844 845 // And author as well 846 if( page.getAuthor() != null ) { 847 change.append( "\r\n" + page.getAuthor() ); 848 } 849 850 ch.m_change = change.toString(); 851 return ch; 852 } 853 854 /** 855 * Returns true, if this user should be ignored. For example, admin users. 856 * 857 * @param context 858 * @return True, if this users should be ignored. 
859 */ 860 private boolean ignoreThisUser( final Context context ) { 861 if( context.hasAdminPermissions() ) { 862 return true; 863 } 864 865 if( m_ignoreAuthenticated && context.getWikiSession().isAuthenticated() ) { 866 return true; 867 } 868 869 if( context.getVariable( "captcha" ) != null ) { 870 return true; 871 } 872 873 return false; 874 } 875 876 /** 877 * Returns a random string of six uppercase characters. 878 * 879 * @return A random string 880 */ 881 private static String getUniqueID() { 882 final StringBuilder sb = new StringBuilder(); 883 for( int i = 0; i < 6; i++ ) { 884 final char x = ( char )( 'A' + RANDOM.nextInt( 26 ) ); 885 sb.append( x ); 886 } 887 888 return sb.toString(); 889 } 890 891 /** 892 * Returns a page to which we shall redirect, based on the current value of the "captcha" parameter. 893 * 894 * @param ctx WikiContext 895 * @return An URL to redirect to 896 */ 897 private String getRedirectPage( final Context ctx ) { 898 if( m_useCaptcha ) { 899 return ctx.getURL( ContextEnum.PAGE_NONE.getRequestContext(), "Captcha.jsp", "page= " +ctx.getEngine().encodeName( ctx.getPage().getName() ) ); 900 } 901 902 return ctx.getURL( ContextEnum.PAGE_VIEW.getRequestContext(), m_errorPage ); 903 } 904 905 /** 906 * Checks whether the UserProfile matches certain checks. 907 * 908 * @param profile The profile to check 909 * @param context The WikiContext 910 * @return False, if this userprofile is suspect and should not be allowed to be added. 
911 * @since 2.6.1 912 */ 913 public boolean isValidUserProfile( final Context context, final UserProfile profile ) { 914 try { 915 checkPatternList( context, profile.getEmail(), profile.getEmail() ); 916 checkPatternList( context, profile.getFullname(), profile.getFullname() ); 917 checkPatternList( context, profile.getLoginName(), profile.getLoginName() ); 918 } catch( final RedirectException e ) { 919 log.info("Detected attempt to create a spammer user account (see above for rejection reason)"); 920 return false; 921 } 922 923 return true; 924 } 925 926 /** 927 * This method is used to calculate an unique code when submitting the page to detect edit conflicts. 928 * It currently incorporates the last-modified date of the page, and the IP address of the submitter. 929 * 930 * @param page The WikiPage under edit 931 * @param request The HTTP Request 932 * @since 2.6 933 * @return A hash value for this page and session 934 */ 935 public static final String getSpamHash( final Page page, final HttpServletRequest request ) { 936 long lastModified = 0; 937 938 if( page.getLastModified() != null ) { 939 lastModified = page.getLastModified().getTime(); 940 } 941 final long remote = HttpUtil.getRemoteAddress( request ).hashCode(); 942 943 return Long.toString( lastModified ^ remote ); 944 } 945 946 /** 947 * Returns the name of the hash field to be used in this request. The value is unique per session, and once 948 * the session has expired, you cannot edit anymore. 
949 * 950 * @param request The page request 951 * @return The name to be used in the hash field 952 * @since 2.6 953 */ 954 public static final String getHashFieldName( final HttpServletRequest request ) { 955 String hash = null; 956 957 if( request.getSession() != null ) { 958 hash = ( String )request.getSession().getAttribute( "_hash" ); 959 960 if( hash == null ) { 961 hash = c_hashName; 962 request.getSession().setAttribute( "_hash", hash ); 963 } 964 } 965 966 if( c_hashName == null || c_lastUpdate < ( System.currentTimeMillis() - HASH_DELAY * 60 * 60 * 1000 ) ) { 967 c_hashName = getUniqueID().toLowerCase(); 968 c_lastUpdate = System.currentTimeMillis(); 969 } 970 971 return hash != null ? hash : c_hashName; 972 } 973 974 975 /** 976 * This method checks if the hash value is still valid, i.e. if it exists at all. This can occur in two cases: 977 * either this is a spam bot which is not adaptive, or it is someone who has been editing one page for too long, 978 * and their session has expired. 979 * <p> 980 * This method puts a redirect to the http response field to page "SessionExpired" and logs the incident in 981 * the spam log (it may or may not be spam, but it's rather likely that it is). 982 * 983 * @param context The WikiContext 984 * @param pageContext The JSP PageContext. 985 * @return True, if hash is okay. False, if hash is not okay, and you need to redirect. 
986 * @throws IOException If redirection fails 987 * @since 2.6 988 */ 989 public static final boolean checkHash( final Context context, final PageContext pageContext ) throws IOException { 990 final String hashName = getHashFieldName( (HttpServletRequest)pageContext.getRequest() ); 991 if( pageContext.getRequest().getParameter(hashName) == null ) { 992 if( pageContext.getAttribute( hashName ) == null ) { 993 final Change change = getChange( context, EditorManager.getEditedText( pageContext ) ); 994 log( context, REJECT, "MissingHash", change.m_change ); 995 996 final String redirect = context.getURL( ContextEnum.PAGE_VIEW.getRequestContext(),"SessionExpired" ); 997 ( ( HttpServletResponse )pageContext.getResponse() ).sendRedirect( redirect ); 998 return false; 999 } 1000 } 1001 1002 return true; 1003 } 1004 1005 /** 1006 * This helper method adds all the input fields to your editor that the SpamFilter requires 1007 * to check for spam. This <i>must</i> be in your editor form if you intend to use the SpamFilter. 1008 * 1009 * @param pageContext The PageContext 1010 * @return A HTML string which contains input fields for the SpamFilter. 1011 */ 1012 public static final String insertInputFields( final PageContext pageContext ) { 1013 final Context ctx = Context.findContext( pageContext ); 1014 final Engine engine = ctx.getEngine(); 1015 final StringBuilder sb = new StringBuilder(); 1016 if( engine.getContentEncoding().equals( StandardCharsets.UTF_8 ) ) { 1017 sb.append( "<input name='encodingcheck' type='hidden' value='\u3041' />\n" ); 1018 } 1019 1020 return sb.toString(); 1021 } 1022 1023 /** 1024 * A local class for storing host information. 
*
     * Each entry records an address, the change associated with it, the time the entry was created and
     * a release time computed from the configured ban time.
     */
    private class Host {

        // Creation time of this entry, in milliseconds since the epoch.
        private long m_addedTime = System.currentTimeMillis();
        // Creation time plus m_banTime minutes, in milliseconds since the epoch.
        private long m_releaseTime;
        private String m_address;
        private Change m_change;

        public String getAddress() {
            return m_address;
        }

        public long getReleaseTime() {
            return m_releaseTime;
        }

        public long getAddedTime() {
            return m_addedTime;
        }

        public Change getChange() {
            return m_change;
        }

        /**
         * Creates an entry for the given address and change.
         *
         * @param ipaddress the address to remember
         * @param change the change associated with the address
         */
        public Host( final String ipaddress, final Change change ) {
            m_address = ipaddress;
            m_change = change;
            // m_banTime is multiplied as minutes; the long literal keeps the arithmetic in 64 bits.
            m_releaseTime = System.currentTimeMillis() + m_banTime * 60 * 1000L;
        }

    }

    /**
     * Holds the text of a change together with counters for additions and removals. Two instances are
     * equal when their change texts are equal; the counters do not take part in the comparison.
     */
    private static class Change {

        // Text of the change; may be null when it has never been assigned.
        public String m_change;
        public int m_adds;
        public int m_removals;

        /**
         * @return the change text as stored, which may be null
         */
        @Override public String toString() {
            return m_change;
        }

        /**
         * Compares by change text. A missing (null) text only equals another missing text.
         */
        @Override public boolean equals( final Object o ) {
            if( this == o ) {
                return true;
            }
            if( !( o instanceof Change ) ) {
                return false;
            }
            final String other = ( ( Change )o ).m_change;
            return m_change == null ? other == null : m_change.equals( other );
        }

        /**
         * Hash code consistent with {@link #equals(Object)}; a missing text hashes like an empty contribution.
         */
        @Override public int hashCode() {
            return ( m_change != null ? m_change.hashCode() : 0 ) + 17;
        }

    }

}