001/*
002    Licensed to the Apache Software Foundation (ASF) under one
003    or more contributor license agreements.  See the NOTICE file
004    distributed with this work for additional information
005    regarding copyright ownership.  The ASF licenses this file
006    to you under the Apache License, Version 2.0 (the
007    "License"); you may not use this file except in compliance
008    with the License.  You may obtain a copy of the License at
009
010       http://www.apache.org/licenses/LICENSE-2.0
011
012    Unless required by applicable law or agreed to in writing,
013    software distributed under the License is distributed on an
014    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015    KIND, either express or implied.  See the License for the
016    specific language governing permissions and limitations
017    under the License.  
018 */
019package org.apache.wiki.filters;
020
021import net.sf.akismet.Akismet;
022import org.apache.commons.lang3.time.StopWatch;
023import org.apache.log4j.Logger;
024import org.apache.oro.text.regex.MalformedPatternException;
025import org.apache.oro.text.regex.MatchResult;
026import org.apache.oro.text.regex.Pattern;
027import org.apache.oro.text.regex.PatternCompiler;
028import org.apache.oro.text.regex.PatternMatcher;
029import org.apache.oro.text.regex.Perl5Compiler;
030import org.apache.oro.text.regex.Perl5Matcher;
031import org.apache.wiki.InternalWikiException;
032import org.apache.wiki.api.core.Attachment;
033import org.apache.wiki.api.core.Context;
034import org.apache.wiki.api.core.ContextEnum;
035import org.apache.wiki.api.core.Engine;
036import org.apache.wiki.api.core.Page;
037import org.apache.wiki.api.exceptions.ProviderException;
038import org.apache.wiki.api.exceptions.RedirectException;
039import org.apache.wiki.api.filters.BasePageFilter;
040import org.apache.wiki.api.providers.WikiProvider;
041import org.apache.wiki.attachment.AttachmentManager;
042import org.apache.wiki.auth.user.UserProfile;
043import org.apache.wiki.pages.PageManager;
044import org.apache.wiki.ui.EditorManager;
045import org.apache.wiki.util.FileUtil;
046import org.apache.wiki.util.HttpUtil;
047import org.apache.wiki.util.TextUtil;
048import org.suigeneris.jrcs.diff.Diff;
049import org.suigeneris.jrcs.diff.DifferentiationFailedException;
050import org.suigeneris.jrcs.diff.Revision;
051import org.suigeneris.jrcs.diff.delta.AddDelta;
052import org.suigeneris.jrcs.diff.delta.ChangeDelta;
053import org.suigeneris.jrcs.diff.delta.DeleteDelta;
054import org.suigeneris.jrcs.diff.delta.Delta;
055import org.suigeneris.jrcs.diff.myers.MyersDiff;
056
057import javax.servlet.http.HttpServletRequest;
058import javax.servlet.http.HttpServletResponse;
059import javax.servlet.jsp.PageContext;
060import java.io.BufferedReader;
061import java.io.IOException;
062import java.io.InputStream;
063import java.io.InputStreamReader;
064import java.io.StringReader;
065import java.io.StringWriter;
066import java.nio.charset.StandardCharsets;
067import java.util.ArrayList;
068import java.util.Collection;
069import java.util.Date;
070import java.util.Iterator;
071import java.util.Properties;
072import java.util.Random;
073import java.util.StringTokenizer;
074import java.util.Vector;
075import java.util.concurrent.ThreadLocalRandom;
076
077
078/**
079 *  This is Herb, the JSPWiki spamfilter that can also do choke modifications.
080 *
081 *  Parameters:
082 *  <ul>
083 *    <li>wordlist - Page name where the spamword regexps are found.  Use [{SET spamwords='regexp list separated with spaces'}] on
084 *     that page.  Default is "SpamFilterWordList".
085 *    <li>IPlist - Page name where the IP regexps are found.  Use [{SET ips='regexp list separated with spaces'}] on
086 *     that page.  Default is "SpamFilterIPList".
087 *    <li>maxpagenamelength - Maximum page name length. Default is 100.
088 *    <li>blacklist - The name of an attachment containing the list of spam patterns, one per line. Default is
089 *        "SpamFilterWordList/blacklist.txt"</li>
090 *    <li>errorpage - The page to which the user is redirected.  Has a special variable $msg which states the reason. Default is "RejectedMessage".
091 *    <li>pagechangesinminute - How many page changes are allowed/minute.  Default is 5.</li>
092 *    <li>similarchanges - How many similar page changes are allowed before the host is banned.  Default is 2.  (since 2.4.72)</li>
093 *    <li>bantime - How long an IP address stays on the temporary ban list (default is 60 for 60 minutes).</li>
 *    <li>maxurls - How many URLs can be added to the page before it is considered spam (default is 10)</li>
095 *    <li>akismet-apikey - The Akismet API key (see akismet.org)</li>
096 *    <li>ignoreauthenticated - If set to "true", all authenticated users are ignored and never caught in SpamFilter</li>
097 *    <li>captcha - Sets the captcha technology to use.  Current allowed values are "none" and "asirra".</li>
 *    <li>strategy - Sets the filtering strategy to use.  If set to "eager", will stop at the first probable
 *        match, and won't consider any other tests.  This is the default, as it's considerably lighter. If set to "score", will go through all of the tests
 *        and calculate a score for the spam, which is then compared to a filter level value.</li>
101 *  </ul>
102 *
103 *  <p>Please see the default editors/plain.jsp for examples on how the SpamFilter integrates
104 *  with the editor system.</p>
105 *  
106 *  <p>Changes by admin users are ignored in any case.</p>
107 *
108 *  @since 2.1.112
109 */
110public class SpamFilter extends BasePageFilter {
111    
    /** Name of the context variable in which the accumulated spam score is kept when the filter does not stop at the first match. */
    private static final String ATTR_SPAMFILTER_SCORE = "spamfilter.score";

    // Reason codes.  The checks in this class pass them as the "source" column of the spam log
    // and as the error identifier given to checkStrategy().
    private static final String REASON_REGEXP = "Regexp";
    private static final String REASON_IP_BANNED_TEMPORARILY = "IPBannedTemporarily";
    private static final String REASON_IP_BANNED_PERMANENTLY = "IPBannedPermanently";
    private static final String REASON_BOT_TRAP = "BotTrap";
    private static final String REASON_AKISMET = "Akismet";
    private static final String REASON_TOO_MANY_URLS = "TooManyUrls";
    private static final String REASON_SIMILAR_MODIFICATIONS = "SimilarModifications";
    private static final String REASON_TOO_MANY_MODIFICATIONS = "TooManyModifications";
    private static final String REASON_PAGENAME_TOO_LONG = "PageNameTooLong";
    private static final String REASON_UTF8_TRAP = "UTF8Trap";

    // Presumably the names of the page variables ([{SET spamwords=...}] and [{SET ips=...}]) that hold
    // the pattern lists; they are not referenced in the part of the file reviewed here - TODO confirm.
    private static final String LISTVAR = "spamwords";
    private static final String LISTIPVAR = "ips";
126
127    private static final Random RANDOM = ThreadLocalRandom.current();
128
129    /** The filter property name for specifying the page which contains the list of spamwords. Value is <tt>{@value}</tt>. */
130    public static final String  PROP_WORDLIST              = "wordlist";
131
132    /** The filter property name for specifying the page which contains the list of IPs to ban. Value is <tt>{@value}</tt>. */
133    public static final String  PROP_IPLIST                = "IPlist";
134
135    /** The filter property name for specifying the maximum page name length.  Value is <tt>{@value}</tt>. */
136    public static final String  PROP_MAX_PAGENAME_LENGTH   = "maxpagenamelength";
137
138    /** The filter property name for the page to which you are directed if Herb rejects your edit.  Value is <tt>{@value}</tt>. */
139    public static final String  PROP_ERRORPAGE             = "errorpage";
140    
141    /** The filter property name for specifying how many changes is any given IP address
142     *  allowed to do per minute.  Value is <tt>{@value}</tt>.
143     */
144    public static final String  PROP_PAGECHANGES           = "pagechangesinminute";
145    
146    /** The filter property name for specifying how many similar changes are allowed before a host is banned.  Value is <tt>{@value}</tt>. */
147    public static final String  PROP_SIMILARCHANGES        = "similarchanges";
148    
149    /** The filter property name for specifying how long a host is banned.  Value is <tt>{@value}</tt>.*/
150    public static final String  PROP_BANTIME               = "bantime";
151    
152    /** The filter property name for the attachment containing the blacklist.  Value is <tt>{@value}</tt>.*/
153    public static final String  PROP_BLACKLIST             = "blacklist";
154    
155    /** The filter property name for specifying how many URLs can any given edit contain. Value is <tt>{@value}</tt> */
156    public static final String  PROP_MAXURLS               = "maxurls";
157    
158    /** The filter property name for specifying the Akismet API-key.  Value is <tt>{@value}</tt>. */
159    public static final String  PROP_AKISMET_API_KEY       = "akismet-apikey";
160    
161    /** The filter property name for specifying whether authenticated users should be ignored. Value is <tt>{@value}</tt>. */
162    public static final String  PROP_IGNORE_AUTHENTICATED  = "ignoreauthenticated";
163    
164    /** The filter property name for specifying which captcha technology should be used. Value is <tt>{@value}</tt>. */
165    public static final String  PROP_CAPTCHA               = "captcha";
166    
167    /** The filter property name for specifying which filter strategy should be used.  Value is <tt>{@value}</tt>. */
168    public static final String  PROP_FILTERSTRATEGY        = "strategy";
169
170    /** The string specifying the "eager" strategy. Value is <tt>{@value}</tt>. */
171    public static final String  STRATEGY_EAGER             = "eager";
172    
173    /** The string specifying the "score" strategy. Value is <tt>{@value}</tt>. */
174    public static final String  STRATEGY_SCORE             = "score";
175
    /** Regular expression matching http, https and mailto links; compiled in initialize() and used to count the URLs in a change. */
    private static final String URL_REGEXP = "(http://|https://|mailto:)([A-Za-z0-9_/\\.\\+\\?\\#\\-\\@=&;]+)";

    // Configurable names and limits.  The initial values are the defaults that initialize() keeps
    // when the corresponding filter property is not set.
    private String          m_forbiddenWordsPage = "SpamFilterWordList";
    private String          m_forbiddenIPsPage   = "SpamFilterIPList";
    // Kept as a string; it is converted to a number each time a page name is checked.
    private String          m_pageNameMaxLength  = "100";
    private String          m_errorPage          = "RejectedMessage";
    // Name of the attachment looked up when checking whether the blacklist has changed.
    private String          m_blacklist          = "SpamFilterWordList/blacklist.txt";

    // ORO Perl5 matcher/compiler used for the URL pattern and for the configured spam patterns.
    private PatternMatcher  m_matcher = new Perl5Matcher();
    private PatternCompiler m_compiler = new Perl5Compiler();

    // Compiled pattern lists.  refreshBlacklists() treats null or empty as "needs to be (re)built".
    private Collection<Pattern> m_spamPatterns = null;
    private Collection<Pattern> m_IPPatterns = null;

    // Time of the last pattern rebuild; starts at the epoch and is reset whenever a rebuild is done.
    private Date            m_lastRebuild = new Date( 0L );

    // c_spamlog receives one line per filter decision (see log()); log is the ordinary class logger.
    private static  Logger  c_spamlog = Logger.getLogger( "SpamLog" );
    private static  Logger  log = Logger.getLogger( SpamFilter.class );

    // Hosts that are currently banned; cleanBanList() removes entries whose release time has passed.
    private Vector<Host>    m_temporaryBanList = new Vector<>();

    private int             m_banTime = 60; // minutes

    // Recent modifications (host + change); entries older than one minute are dropped by checkSinglePageChange().
    private Vector<Host>    m_lastModifications = new Vector<>();

    /** How many times a single IP address can change a page per minute? */
    private int             m_limitSinglePageChanges = 5;

    /** How many times can you add the exact same string to a page? */
    private int             m_limitSimilarChanges = 2;

    /** How many URLs can be added at maximum. */
    private int             m_maxUrls = 10;

    // Compiled form of URL_REGEXP, set in initialize().
    private Pattern         m_urlPattern;
    // Akismet client, created lazily by checkAkismet() on first use.
    private Akismet         m_akismet;

    // Akismet is only consulted when this is non-null; checkAkismet() resets it to null if the key cannot be verified.
    private String          m_akismetAPIKey = null;

    // Set to true by initialize() when the "captcha" property equals "asirra".
    private boolean         m_useCaptcha = false;

    /** The limit at which we consider something to be spam. */
    private int             m_scoreLimit = 1;

    /** If set to true, will ignore anyone who is in Authenticated role. */
    private boolean         m_ignoreAuthenticated = false;

    // True for the "eager" strategy: the first failed check rejects the change instead of adding to a score.
    private boolean         m_stopAtFirstMatch = true;

    // NOTE(review): presumably the current hash field name and the time it was last regenerated
    // (see HASH_DELAY below); neither is used in the part of the file reviewed here - confirm.
    private static String   c_hashName;
    private static long     c_lastUpdate;

    /** The HASH_DELAY value is a maximum amount of time that an user can keep
     *  a session open, because after the value has expired, we will invent a new
     *  hash field name.  By default this is {@value} hours, which should be ample
     *  time for someone.
     */
    private static final long HASH_DELAY = 24;
234
235
236    /**
237     *  {@inheritDoc}
238     */
239    @Override
240    public void initialize( final Engine engine, final Properties properties ) {
241        m_forbiddenWordsPage = properties.getProperty( PROP_WORDLIST, m_forbiddenWordsPage );
242        m_forbiddenIPsPage = properties.getProperty( PROP_IPLIST, m_forbiddenIPsPage);
243        m_pageNameMaxLength = properties.getProperty( PROP_MAX_PAGENAME_LENGTH, m_pageNameMaxLength);
244        m_errorPage = properties.getProperty( PROP_ERRORPAGE, m_errorPage );
245        m_limitSinglePageChanges = TextUtil.getIntegerProperty( properties, PROP_PAGECHANGES, m_limitSinglePageChanges );
246        
247        m_limitSimilarChanges = TextUtil.getIntegerProperty( properties, PROP_SIMILARCHANGES, m_limitSimilarChanges );
248
249        m_maxUrls = TextUtil.getIntegerProperty( properties, PROP_MAXURLS, m_maxUrls );
250        m_banTime = TextUtil.getIntegerProperty( properties, PROP_BANTIME, m_banTime );
251        m_blacklist = properties.getProperty( PROP_BLACKLIST, m_blacklist );
252
253        m_ignoreAuthenticated = TextUtil.getBooleanProperty( properties, PROP_IGNORE_AUTHENTICATED, m_ignoreAuthenticated );
254
255        m_useCaptcha = properties.getProperty( PROP_CAPTCHA, "" ).equals("asirra");
256
257        try {
258            m_urlPattern = m_compiler.compile( URL_REGEXP );
259        } catch( final MalformedPatternException e ) {
260            log.fatal( "Internal error: Someone put in a faulty pattern.", e );
261            throw new InternalWikiException( "Faulty pattern." , e);
262        }
263
264        m_akismetAPIKey = TextUtil.getStringProperty( properties, PROP_AKISMET_API_KEY, m_akismetAPIKey );
265        m_stopAtFirstMatch = TextUtil.getStringProperty( properties, PROP_FILTERSTRATEGY, STRATEGY_EAGER ).equals( STRATEGY_EAGER );
266
267        log.info( "# Spam filter initialized.  Temporary ban time " + m_banTime +
268                  " mins, max page changes/minute: " + m_limitSinglePageChanges );
269    }
270
271    private static final int REJECT = 0;
272    private static final int ACCEPT = 1;
273    private static final int NOTE   = 2;
274
275    private static String log( final Context ctx, final int type, final String source, String message ) {
276        message = TextUtil.replaceString( message, "\r\n", "\\r\\n" );
277        message = TextUtil.replaceString( message, "\"", "\\\"" );
278
279        final String uid = getUniqueID();
280
281        final String page   = ctx.getPage().getName();
282        String reason = "UNKNOWN";
283        final String addr   = ctx.getHttpRequest() != null ? HttpUtil.getRemoteAddress( ctx.getHttpRequest() ) : "-";
284
285        switch( type ) {
286            case REJECT:
287                reason = "REJECTED";
288                break;
289            case ACCEPT:
290                reason = "ACCEPTED";
291                break;
292            case NOTE:
293                reason = "NOTE";
294                break;
295            default:
296                throw new InternalWikiException( "Illegal type " + type );
297        }
298        c_spamlog.info( reason + " " + source + " " + uid + " " + addr + " \"" + page + "\" " + message );
299
300        return uid;
301    }
302
303    /** {@inheritDoc} */
304    @Override
305    public String preSave( final Context context, final String content ) throws RedirectException {
306        cleanBanList();
307        refreshBlacklists( context );
308        final Change change = getChange( context, content );
309
310        if( !ignoreThisUser( context ) ) {
311            checkBanList( context, change );
312            checkSinglePageChange( context, content, change );
313            checkIPList( context );
314            checkPatternList( context, content, change );
315            checkPageName( context, content, change);
316        }
317
318        if( !m_stopAtFirstMatch ) {
319            final Integer score = context.getVariable( ATTR_SPAMFILTER_SCORE );
320
321            if( score != null && score.intValue() >= m_scoreLimit ) {
322                throw new RedirectException( "Herb says you got too many points", getRedirectPage( context ) );
323            }
324        }
325
326        log( context, ACCEPT, "-", change.toString() );
327        return content;
328    }
329
330    private void checkPageName( final Context context, final String content, final Change change) throws RedirectException {
331        final Page page = context.getPage();
332        final String pageName = page.getName();
333        final int maxlength = Integer.valueOf(m_pageNameMaxLength);
334        if ( pageName.length() > maxlength) {
335            //
336            //  Spam filter has a match.
337            //
338
339            final String uid = log( context, REJECT, REASON_PAGENAME_TOO_LONG + "(" + m_pageNameMaxLength + ")" , pageName);
340
341            log.info("SPAM:PageNameTooLong (" + uid + "). The length of the page name is too large (" + pageName.length() + " , limit is " + m_pageNameMaxLength + ")");
342            checkStrategy( context, REASON_PAGENAME_TOO_LONG, "Herb says '" + pageName + "' is a bad pageName and I trust Herb! (Incident code " + uid + ")" );
343
344        }
345    }
346
347    private void checkStrategy( final Context context, final String error, final String message ) throws RedirectException {
348        if( m_stopAtFirstMatch ) {
349            throw new RedirectException( message, getRedirectPage( context ) );
350        }
351
352        Integer score = context.getVariable( ATTR_SPAMFILTER_SCORE );
353        if( score != null ) {
354            score = score + 1;
355        } else {
356            score = 1;
357        }
358
359        context.setVariable( ATTR_SPAMFILTER_SCORE, score );
360    }
361    
362    /**
363     *  Parses a list of patterns and returns a Collection of compiled Pattern
364     *  objects.
365     *
366     * @param source
367     * @param list
368     * @return A Collection of the Patterns that were found from the lists.
369     */
370    private Collection< Pattern > parseWordList( final Page source, final String list ) {
371        final ArrayList< Pattern > compiledpatterns = new ArrayList<>();
372
373        if( list != null ) {
374            final StringTokenizer tok = new StringTokenizer( list, " \t\n" );
375
376            while( tok.hasMoreTokens() ) {
377                final String pattern = tok.nextToken();
378
379                try {
380                    compiledpatterns.add( m_compiler.compile( pattern ) );
381                } catch( final MalformedPatternException e ) {
382                    log.debug( "Malformed spam filter pattern " + pattern );
383                    source.setAttribute("error", "Malformed spam filter pattern " + pattern);
384                }
385            }
386        }
387
388        return compiledpatterns;
389    }
390
391    /**
392     *  Takes a MT-Blacklist -formatted blacklist and returns a list of compiled Pattern objects.
393     *
394     *  @param list
395     *  @return The parsed blacklist patterns.
396     */
397    private Collection< Pattern > parseBlacklist( final String list ) {
398        final ArrayList< Pattern > compiledpatterns = new ArrayList<>();
399
400        if( list != null ) {
401            try {
402                final BufferedReader in = new BufferedReader( new StringReader(list) );
403                String line;
404                while( (line = in.readLine() ) != null ) {
405                    line = line.trim();
406                    if( line.length() == 0 ) continue; // Empty line
407                    if( line.startsWith("#") ) continue; // It's a comment
408
409                    int ws = line.indexOf( ' ' );
410                    if( ws == -1 ) ws = line.indexOf( '\t' );
411                    if( ws != -1 ) line = line.substring( 0, ws );
412
413                    try {
414                        compiledpatterns.add( m_compiler.compile( line ) );
415                    } catch( final MalformedPatternException e ) {
416                        log.debug( "Malformed spam filter pattern " + line );
417                    }
418                }
419            } catch( final IOException e ) {
420                log.info( "Could not read patterns; returning what I got" , e );
421            }
422        }
423
424        return compiledpatterns;
425    }
426
427    /**
428     *  Takes a single page change and performs a load of tests on the content change.
429     *  An admin can modify anything.
430     *
431     *  @param context
432     *  @param content
433     *  @throws RedirectException
434     */
435    private synchronized void checkSinglePageChange( final Context context, final String content, final Change change )
436            throws RedirectException {
437        final HttpServletRequest req = context.getHttpRequest();
438
439        if( req != null ) {
440            final String addr = HttpUtil.getRemoteAddress( req );
441            int hostCounter = 0;
442            int changeCounter = 0;
443
444            log.debug( "Change is " + change.m_change );
445
446            final long time = System.currentTimeMillis() - 60*1000L; // 1 minute
447
448            for( final Iterator< Host > i = m_lastModifications.iterator(); i.hasNext(); ) {
449                final Host host = i.next();
450
451                //  Check if this item is invalid
452                if( host.getAddedTime() < time ) {
453                    log.debug( "Removed host " + host.getAddress() + " from modification queue (expired)" );
454                    i.remove();
455                    continue;
456                }
457
458                // Check if this IP address has been seen before
459                if( host.getAddress().equals( addr ) ) {
460                    hostCounter++;
461                }
462
463                //  Check, if this change has been seen before
464                if( host.getChange() != null && host.getChange().equals( change ) ) {
465                    changeCounter++;
466                }
467            }
468
469            //  Now, let's check against the limits.
470            if( hostCounter >= m_limitSinglePageChanges ) {
471                final Host host = new Host( addr, null );
472                m_temporaryBanList.add( host );
473
474                final String uid = log( context, REJECT, REASON_TOO_MANY_MODIFICATIONS, change.m_change );
475                log.info( "SPAM:TooManyModifications (" + uid + "). Added host " + addr + " to temporary ban list for doing too many modifications/minute" );
476                checkStrategy( context, REASON_TOO_MANY_MODIFICATIONS, "Herb says you look like a spammer, and I trust Herb! (Incident code " + uid + ")" );
477            }
478
479            if( changeCounter >= m_limitSimilarChanges ) {
480                final Host host = new Host( addr, null );
481                m_temporaryBanList.add( host );
482
483                final String uid = log( context, REJECT, REASON_SIMILAR_MODIFICATIONS, change.m_change );
484                log.info( "SPAM:SimilarModifications (" + uid + "). Added host " + addr + " to temporary ban list for doing too many similar modifications" );
485                checkStrategy( context, REASON_SIMILAR_MODIFICATIONS, "Herb says you look like a spammer, and I trust Herb! (Incident code "+uid+")");
486            }
487
488            //  Calculate the number of links in the addition.
489            String tstChange  = change.toString();
490            int urlCounter = 0;
491            while( m_matcher.contains( tstChange,m_urlPattern ) ) {
492                final MatchResult m = m_matcher.getMatch();
493                tstChange = tstChange.substring( m.endOffset(0) );
494                urlCounter++;
495            }
496
497            if( urlCounter > m_maxUrls ) {
498                final Host host = new Host( addr, null );
499                m_temporaryBanList.add( host );
500
501                final String uid = log( context, REJECT, REASON_TOO_MANY_URLS, change.toString() );
502                log.info( "SPAM:TooManyUrls (" + uid + "). Added host " + addr + " to temporary ban list for adding too many URLs" );
503                checkStrategy( context, REASON_TOO_MANY_URLS, "Herb says you look like a spammer, and I trust Herb! (Incident code " + uid + ")" );
504            }
505
506            //  Check bot trap
507            checkBotTrap( context, change );
508
509            //  Check UTF-8 mangling
510            checkUTF8( context, change );
511
512            //  Do Akismet check.  This is good to be the last, because this is the most expensive operation.
513            checkAkismet( context, change );
514
515            m_lastModifications.add( new Host( addr, change ) );
516        }
517    }
518
519
520    /**
521     *  Checks against the akismet system.
522     *
523     * @param context
524     * @param change
525     * @throws RedirectException
526     */
527    private void checkAkismet( final Context context, final Change change ) throws RedirectException {
528        if( m_akismetAPIKey != null ) {
529            if( m_akismet == null ) {
530                log.info( "Initializing Akismet spam protection." );
531                m_akismet = new Akismet( m_akismetAPIKey, context.getEngine().getBaseURL() );
532
533                if( !m_akismet.verifyAPIKey() ) {
534                    log.error( "Akismet API key cannot be verified.  Please check your config." );
535                    m_akismetAPIKey = null;
536                    m_akismet = null;
537                }
538            }
539
540            final HttpServletRequest req = context.getHttpRequest();
541
542            //  Akismet will mark all empty statements as spam, so we'll just ignore them.
543            if( change.m_adds == 0 && change.m_removals > 0 ) {
544                return;
545            }
546            
547            if( req != null && m_akismet != null ) {
548                log.debug( "Calling Akismet to check for spam..." );
549
550                final StopWatch sw = new StopWatch();
551                sw.start();
552
553                final String ipAddress     = HttpUtil.getRemoteAddress( req );
554                final String userAgent     = req.getHeader( "User-Agent" );
555                final String referrer      = req.getHeader( "Referer");
556                final String permalink     = context.getViewURL( context.getPage().getName() );
557                final String commentType   = context.getRequestContext().equals( ContextEnum.PAGE_COMMENT.getRequestContext() ) ? "comment" : "edit";
558                final String commentAuthor = context.getCurrentUser().getName();
559                final String commentAuthorEmail = null;
560                final String commentAuthorURL   = null;
561
562                final boolean isSpam = m_akismet.commentCheck( ipAddress,
563                                                               userAgent,
564                                                               referrer,
565                                                               permalink,
566                                                               commentType,
567                                                               commentAuthor,
568                                                               commentAuthorEmail,
569                                                               commentAuthorURL,
570                                                               change.toString(),
571                                                               null );
572
573                sw.stop();
574                log.debug( "Akismet request done in: " + sw );
575
576                if( isSpam ) {
577                    // Host host = new Host( ipAddress, null );
578                    // m_temporaryBanList.add( host );
579
580                    final String uid = log( context, REJECT, REASON_AKISMET, change.toString() );
581                    log.info( "SPAM:Akismet (" + uid + "). Akismet thinks this change is spam; added host to temporary ban list." );
582                    checkStrategy( context, REASON_AKISMET, "Akismet tells Herb you're a spammer, Herb trusts Akismet, and I trust Herb! (Incident code " + uid + ")" );
583                }
584            }
585        }
586    }
587
588    /**
589     * Returns a static string which can be used to detect spambots which just wildly fill in all the fields.
590     *
591     * @return A string
592     */
593    public static String getBotFieldName() {
594        return "submit_auth";
595    }
596
597    /**
598     * This checks whether an invisible field is available in the request, and whether it's contents are suspected spam.
599     *
600     * @param context
601     * @param change
602     * @throws RedirectException
603     */
604    private void checkBotTrap( final Context context, final Change change ) throws RedirectException {
605        final HttpServletRequest request = context.getHttpRequest();
606        if( request != null ) {
607            final String unspam = request.getParameter( getBotFieldName() );
608            if( unspam != null && unspam.length() > 0 ) {
609                final String uid = log( context, REJECT, REASON_BOT_TRAP, change.toString() );
610
611                log.info( "SPAM:BotTrap (" + uid + ").  Wildly behaving bot detected." );
612                checkStrategy( context, REASON_BOT_TRAP, "Spamming attempt detected. (Incident code " + uid + ")" );
613            }
614        }
615    }
616
617    private void checkUTF8( final Context context, final Change change ) throws RedirectException {
618        final HttpServletRequest request = context.getHttpRequest();
619        if( request != null ) {
620            final String utf8field = request.getParameter( "encodingcheck" );
621            if( utf8field != null && !utf8field.equals( "\u3041" ) ) {
622                final String uid = log( context, REJECT, REASON_UTF8_TRAP, change.toString() );
623
624                log.info( "SPAM:UTF8Trap (" + uid + ").  Wildly posting dumb bot detected." );
625                checkStrategy( context, REASON_UTF8_TRAP, "Spamming attempt detected. (Incident code " + uid + ")" );
626            }
627        }
628    }
629
630    /** Goes through the ban list and cleans away any host which has expired from it. */
631    private synchronized void cleanBanList() {
632        final long now = System.currentTimeMillis();
633        for( final Iterator< Host > i = m_temporaryBanList.iterator(); i.hasNext(); ) {
634            final Host host = i.next();
635
636            if( host.getReleaseTime() < now ) {
637                log.debug( "Removed host " + host.getAddress() + " from temporary ban list (expired)" );
638                i.remove();
639            }
640        }
641    }
642
643    /**
644     *  Checks the ban list if the IP address of the changer is already on it.
645     *
646     *  @param context
647     *  @throws RedirectException
648     */
649    private void checkBanList( final Context context, final Change change ) throws RedirectException {
650        final HttpServletRequest req = context.getHttpRequest();
651
652        if( req != null ) {
653            final String remote = HttpUtil.getRemoteAddress(req);
654            final long now = System.currentTimeMillis();
655
656            for( final Host host : m_temporaryBanList ) {
657                if( host.getAddress().equals( remote ) ) {
658                    final long timeleft = ( host.getReleaseTime() - now ) / 1000L;
659
660                    log( context, REJECT, REASON_IP_BANNED_TEMPORARILY, change.m_change );
661                    checkStrategy( context, REASON_IP_BANNED_TEMPORARILY,
662                            "You have been temporarily banned from modifying this wiki. (" + timeleft + " seconds of ban left)" );
663                }
664            }
665        }
666    }
667
668    /**
669     *  If the spam filter notices changes in the black list page, it will refresh them automatically.
670     *
671     *  @param context associated WikiContext
672     */
673    private void refreshBlacklists( final Context context ) {
674        try {
675            boolean rebuild = false;
676
677            //  Rebuild, if the spam words page, the attachment or the IP ban page has changed since.
678            final Page sourceSpam = context.getEngine().getManager( PageManager.class ).getPage( m_forbiddenWordsPage );
679            if( sourceSpam != null ) {
680                if( m_spamPatterns == null || m_spamPatterns.isEmpty() || sourceSpam.getLastModified().after( m_lastRebuild ) ) {
681                    rebuild = true;
682                }
683            }
684
685            final Attachment att = context.getEngine().getManager( AttachmentManager.class ).getAttachmentInfo( context, m_blacklist );
686            if( att != null ) {
687                if( m_spamPatterns == null || m_spamPatterns.isEmpty() || att.getLastModified().after( m_lastRebuild ) ) {
688                    rebuild = true;
689                }
690            }
691
692            final Page sourceIPs = context.getEngine().getManager( PageManager.class ).getPage( m_forbiddenIPsPage );
693            if( sourceIPs != null ) {
694                if( m_IPPatterns == null || m_IPPatterns.isEmpty() || sourceIPs.getLastModified().after( m_lastRebuild ) ) {
695                    rebuild = true;
696                }
697            }
698
699            //  Do the actual rebuilding.  For simplicity's sake, we always rebuild the complete filter list regardless of what changed.
700            if( rebuild ) {
701                m_lastRebuild = new Date();
702                m_spamPatterns = parseWordList( sourceSpam, ( sourceSpam != null ) ? sourceSpam.getAttribute( LISTVAR ) : null );
703
704                log.info( "Spam filter reloaded - recognizing " + m_spamPatterns.size() + " patterns from page " + m_forbiddenWordsPage );
705
706                m_IPPatterns = parseWordList( sourceIPs,  ( sourceIPs != null ) ? sourceIPs.getAttribute( LISTIPVAR ) : null );
707                log.info( "IP filter reloaded - recognizing " + m_IPPatterns.size() + " patterns from page " + m_forbiddenIPsPage );
708
709                if( att != null ) {
710                    final InputStream in = context.getEngine().getManager( AttachmentManager.class ).getAttachmentStream(att);
711                    final StringWriter out = new StringWriter();
712                    FileUtil.copyContents( new InputStreamReader( in, StandardCharsets.UTF_8 ), out );
713                    final Collection< Pattern > blackList = parseBlacklist( out.toString() );
714                    log.info( "...recognizing additional " + blackList.size() + " patterns from blacklist " + m_blacklist );
715                    m_spamPatterns.addAll( blackList );
716                }
717            }
718        } catch( final IOException ex ) {
719            log.info( "Unable to read attachment data, continuing...", ex );
720        } catch( final ProviderException ex ) {
721            log.info( "Failed to read spam filter attachment, continuing...", ex );
722        }
723    }
724
725    /**
726     *  Does a check against a known pattern list.
727     *
728     *  @param context
729     *  @param content
730     *  @param change
731     *  @throws RedirectException
732     */
733    private void checkPatternList( final Context context, final String content, final Change change ) throws RedirectException {
734        // If we have no spam patterns defined, or we're trying to save the page containing the patterns, just return.
735        if( m_spamPatterns == null || context.getPage().getName().equals( m_forbiddenWordsPage ) ) {
736            return;
737        }
738
739        String ch = change.toString();
740        if( context.getHttpRequest() != null ) {
741            ch += HttpUtil.getRemoteAddress( context.getHttpRequest() );
742        }
743
744        for( final Pattern p : m_spamPatterns ) {
745            // log.debug("Attempting to match page contents with "+p.getPattern());
746
747            if( m_matcher.contains( ch, p ) ) {
748                //  Spam filter has a match.
749                final String uid = log( context, REJECT, REASON_REGEXP + "(" + p.getPattern() + ")", ch );
750
751                log.info( "SPAM:Regexp (" + uid + "). Content matches the spam filter '" + p.getPattern() + "'" );
752                checkStrategy( context, REASON_REGEXP, "Herb says '" + p.getPattern() + "' is a bad spam word and I trust Herb! (Incident code " + uid + ")" );
753            }
754        }
755    }
756
757
758    /**
759     *  Does a check against a pattern list of IPs.
760     *
761     *  @param context
762     *  @throws RedirectException
763     */
764    private void checkIPList( final Context context ) throws RedirectException {
765        //  If we have no IP patterns defined, or we're trying to save the page containing the IP patterns, just return.
766        if( m_IPPatterns == null || context.getPage().getName().equals( m_forbiddenIPsPage ) ) {
767            return;
768        }
769
770        final String remoteIP = HttpUtil.getRemoteAddress( context.getHttpRequest() );
771        log.info("Attempting to match remoteIP " + remoteIP + " against " + m_IPPatterns.size() + " patterns");
772
773        for( final Pattern p : m_IPPatterns ) {
774             log.debug("Attempting to match remoteIP with " + p.getPattern());
775
776            if( m_matcher.contains( remoteIP, p ) ) {
777
778                //  IP filter has a match.
779                //
780                final String uid = log( context, REJECT, REASON_IP_BANNED_PERMANENTLY + "(" + p.getPattern() + ")", remoteIP );
781
782                log.info( "SPAM:IPBanList (" + uid + "). remoteIP matches the IP filter '" + p.getPattern() + "'" );
783                checkStrategy( context, REASON_IP_BANNED_PERMANENTLY, "Herb says '" + p.getPattern() + "' is a banned IP and I trust Herb! (Incident code " + uid + ")" );
784            }
785        }
786    }
787
788    private void checkPatternList( final Context context, final String content, final String change ) throws RedirectException {
789        final Change c = new Change();
790        c.m_change = change;
791        checkPatternList( context, content, c );
792    }
793 
794    /**
795     *  Creates a simple text string describing the added content.
796     *
797     *  @param context
798     *  @param newText
799     *  @return Empty string, if there is no change.
800     */
801    private static Change getChange( final Context context, final String newText ) {
802        final Page page = context.getPage();
803        final StringBuffer change = new StringBuffer();
804        final Engine engine = context.getEngine();
805        // Get current page version
806
807        final Change ch = new Change();
808        
809        try {
810            final String oldText = engine.getManager( PageManager.class ).getPureText( page.getName(), WikiProvider.LATEST_VERSION );
811            final String[] first  = Diff.stringToArray( oldText );
812            final String[] second = Diff.stringToArray( newText );
813            final Revision rev = Diff.diff( first, second, new MyersDiff() );
814
815            if( rev == null || rev.size() == 0 ) {
816                return ch;
817            }
818            
819            for( int i = 0; i < rev.size(); i++ ) {
820                final Delta d = rev.getDelta( i );
821
822                if( d instanceof AddDelta ) {
823                    d.getRevised().toString( change, "", "\r\n" );
824                    ch.m_adds++;
825                    
826                } else if( d instanceof ChangeDelta ) {
827                    d.getRevised().toString( change, "", "\r\n" );
828                    ch.m_adds++;
829                    
830                } else if( d instanceof DeleteDelta ) {
831                    ch.m_removals++;
832                }
833            }
834        } catch( final DifferentiationFailedException e ) {
835            log.error( "Diff failed", e );
836        }
837
838        //  Don't forget to include the change note, too
839        final String changeNote = page.getAttribute( Page.CHANGENOTE );
840        if( changeNote != null ) {
841            change.append( "\r\n" );
842            change.append( changeNote );
843        }
844
845        //  And author as well
846        if( page.getAuthor() != null ) {
847            change.append( "\r\n" + page.getAuthor() );
848        }
849
850        ch.m_change = change.toString();
851        return ch;
852    }
853
854    /**
855     *  Returns true, if this user should be ignored.  For example, admin users.
856     *
857     * @param context
858     * @return True, if this users should be ignored.
859     */
860    private boolean ignoreThisUser( final Context context ) {
861        if( context.hasAdminPermissions() ) {
862            return true;
863        }
864
865        if( m_ignoreAuthenticated && context.getWikiSession().isAuthenticated() ) {
866            return true;
867        }
868
869        if( context.getVariable( "captcha" ) != null ) {
870            return true;
871        }
872
873        return false;
874    }
875
876    /**
877     *  Returns a random string of six uppercase characters.
878     *
879     *  @return A random string
880     */
881    private static String getUniqueID() {
882        final StringBuilder sb = new StringBuilder();
883        for( int i = 0; i < 6; i++ ) {
884            final char x = ( char )( 'A' + RANDOM.nextInt( 26 ) );
885            sb.append( x );
886        }
887
888        return sb.toString();
889    }
890
891    /**
892     *  Returns a page to which we shall redirect, based on the current value of the "captcha" parameter.
893     *
894     *  @param ctx WikiContext
895     *  @return An URL to redirect to
896     */
897    private String getRedirectPage( final Context ctx ) {
898        if( m_useCaptcha ) {
899            return ctx.getURL( ContextEnum.PAGE_NONE.getRequestContext(), "Captcha.jsp", "page= " +ctx.getEngine().encodeName( ctx.getPage().getName() ) );
900        }
901
902        return ctx.getURL( ContextEnum.PAGE_VIEW.getRequestContext(), m_errorPage );
903    }
904
905    /**
906     *  Checks whether the UserProfile matches certain checks.
907     *
908     *  @param profile The profile to check
909     *  @param context The WikiContext
910     *  @return False, if this userprofile is suspect and should not be allowed to be added.
911     *  @since 2.6.1
912     */
913    public boolean isValidUserProfile( final Context context, final UserProfile profile ) {
914        try {
915            checkPatternList( context, profile.getEmail(), profile.getEmail() );
916            checkPatternList( context, profile.getFullname(), profile.getFullname() );
917            checkPatternList( context, profile.getLoginName(), profile.getLoginName() );
918        } catch( final RedirectException e ) {
919            log.info("Detected attempt to create a spammer user account (see above for rejection reason)");
920            return false;
921        }
922
923        return true;
924    }
925
926    /**
927     *  This method is used to calculate an unique code when submitting the page to detect edit conflicts.  
928     *  It currently incorporates the last-modified date of the page, and the IP address of the submitter.
929     *
930     *  @param page The WikiPage under edit
931     *  @param request The HTTP Request
932     *  @since 2.6
933     *  @return A hash value for this page and session
934     */
935    public static final String getSpamHash( final Page page, final HttpServletRequest request ) {
936        long lastModified = 0;
937
938        if( page.getLastModified() != null ) {
939            lastModified = page.getLastModified().getTime();
940        }
941        final long remote = HttpUtil.getRemoteAddress( request ).hashCode();
942
943        return Long.toString( lastModified ^ remote );
944    }
945
946    /**
947     *  Returns the name of the hash field to be used in this request. The value is unique per session, and once 
948     *  the session has expired, you cannot edit anymore.
949     *
950     *  @param request The page request
951     *  @return The name to be used in the hash field
952     *  @since  2.6
953     */
954    public static final String getHashFieldName( final HttpServletRequest request ) {
955        String hash = null;
956
957        if( request.getSession() != null ) {
958            hash = ( String )request.getSession().getAttribute( "_hash" );
959
960            if( hash == null ) {
961                hash = c_hashName;
962                request.getSession().setAttribute( "_hash", hash );
963            }
964        }
965
966        if( c_hashName == null || c_lastUpdate < ( System.currentTimeMillis() - HASH_DELAY * 60 * 60 * 1000 ) ) {
967            c_hashName = getUniqueID().toLowerCase();
968            c_lastUpdate = System.currentTimeMillis();
969        }
970
971        return hash != null ? hash : c_hashName;
972    }
973
974
975    /**
976     *  This method checks if the hash value is still valid, i.e. if it exists at all. This can occur in two cases: 
977     *  either this is a spam bot which is not adaptive, or it is someone who has been editing one page for too long, 
978     *  and their session has expired.
979     *  <p>
980     *  This method puts a redirect to the http response field to page "SessionExpired" and logs the incident in 
981     *  the spam log (it may or may not be spam, but it's rather likely that it is).
982     *
983     *  @param context The WikiContext
984     *  @param pageContext The JSP PageContext.
985     *  @return True, if hash is okay.  False, if hash is not okay, and you need to redirect.
986     *  @throws IOException If redirection fails
987     *  @since 2.6
988     */
989    public static final boolean checkHash( final Context context, final PageContext pageContext ) throws IOException {
990        final String hashName = getHashFieldName( (HttpServletRequest)pageContext.getRequest() );
991        if( pageContext.getRequest().getParameter(hashName) == null ) {
992            if( pageContext.getAttribute( hashName ) == null ) {
993                final Change change = getChange( context, EditorManager.getEditedText( pageContext ) );
994                log( context, REJECT, "MissingHash", change.m_change );
995
996                final String redirect = context.getURL( ContextEnum.PAGE_VIEW.getRequestContext(),"SessionExpired" );
997                ( ( HttpServletResponse )pageContext.getResponse() ).sendRedirect( redirect );
998                return false;
999            }
1000        }
1001
1002        return true;
1003    }
1004
1005    /**
1006     * This helper method adds all the input fields to your editor that the SpamFilter requires
1007     * to check for spam.  This <i>must</i> be in your editor form if you intend to use the SpamFilter.
1008     *  
1009     * @param pageContext The PageContext
1010     * @return A HTML string which contains input fields for the SpamFilter.
1011     */
1012    public static final String insertInputFields( final PageContext pageContext ) {
1013        final Context ctx = Context.findContext( pageContext );
1014        final Engine engine = ctx.getEngine();
1015        final StringBuilder sb = new StringBuilder();
1016        if( engine.getContentEncoding().equals( StandardCharsets.UTF_8 ) ) {
1017            sb.append( "<input name='encodingcheck' type='hidden' value='\u3041' />\n" );
1018        }
1019
1020        return sb.toString();
1021    }
1022    
1023    /**
1024     *  A local class for storing host information.
1025     *
1026     *  @since
1027     */
1028    private class Host {
1029
1030        private long m_addedTime = System.currentTimeMillis();
1031        private long m_releaseTime;
1032        private String m_address;
1033        private Change m_change;
1034
1035        public String getAddress() {
1036            return m_address;
1037        }
1038
1039        public long getReleaseTime() {
1040            return m_releaseTime;
1041        }
1042
1043        public long getAddedTime() {
1044            return m_addedTime;
1045        }
1046
1047        public Change getChange() {
1048            return m_change;
1049        }
1050
1051        public Host( final String ipaddress, final Change change ) {
1052            m_address = ipaddress;
1053            m_change = change;
1054            m_releaseTime = System.currentTimeMillis() + m_banTime * 60 * 1000L;
1055        }
1056        
1057    }
1058    
1059    private static class Change {
1060        
1061        public String m_change;
1062        public int    m_adds;
1063        public int    m_removals;
1064
1065        @Override public String toString() {
1066            return m_change;
1067        }
1068
1069        @Override public boolean equals( final Object o ) {
1070            if( o instanceof Change ) {
1071                return m_change.equals( ( ( Change )o ).m_change );
1072            }
1073            return false;
1074        }
1075
1076        @Override public int hashCode() {
1077            return m_change.hashCode() + 17;
1078        }
1079        
1080    }
1081
1082}