public class CrawlConfig extends Object
| Constructor and Description |
|---|
CrawlConfig() |
| Modifier and Type | Method and Description |
|---|---|
void |
addAuthInfo(AuthInfo authInfo) |
List<AuthInfo> |
getAuthInfos() |
int |
getCleanupDelaySeconds() |
int |
getConnectionTimeout() |
String |
getCookiePolicy() |
org.apache.http.client.CookieStore |
getCookieStore()
Gets the configured
CookieStore or null if none is set |
String |
getCrawlStorageFolder() |
long |
getDbLockTimeout() |
Collection<org.apache.http.message.BasicHeader> |
getDefaultHeaders()
Return a copy of the default header collection.
|
org.apache.http.conn.DnsResolver |
getDnsResolver() |
int |
getMaxConnectionsPerHost() |
int |
getMaxDepthOfCrawling() |
int |
getMaxDownloadSize() |
int |
getMaxOutgoingLinksToFollow() |
int |
getMaxPagesToFetch() |
int |
getMaxTotalConnections() |
int |
getPolitenessDelay() |
String |
getProxyHost() |
String |
getProxyPassword() |
int |
getProxyPort() |
String |
getProxyUsername() |
int |
getSocketTimeout() |
int |
getThreadMonitoringDelaySeconds() |
int |
getThreadShutdownDelaySeconds() |
String |
getUserAgentString() |
boolean |
isFollowRedirects() |
boolean |
isIncludeBinaryContentInCrawling() |
boolean |
isIncludeHttpsPages() |
boolean |
isOnlineTldListUpdate() |
boolean |
isProcessBinaryContentInCrawling() |
boolean |
isRespectNoFollow()
Returns whether the respectNoFollow setting is enabled |
boolean |
isRespectNoIndex() |
boolean |
isResumableCrawling() |
boolean |
isShutdownOnEmptyQueue() |
void |
setAuthInfos(List<AuthInfo> authInfos) |
void |
setCleanupDelaySeconds(int delay) |
void |
setConnectionTimeout(int connectionTimeout) |
void |
setCookiePolicy(String cookiePolicy) |
void |
setCookieStore(org.apache.http.client.CookieStore cookieStore)
Sets the
CookieStore to be used |
void |
setCrawlStorageFolder(String crawlStorageFolder)
The folder which will be used by the crawler for storing the intermediate
crawl data.
|
void |
setDbLockTimeout(long dbLockTimeout)
Set the lock timeout for the underlying sleepycat DB, in milliseconds.
|
void |
setDefaultHeaders(Collection<? extends org.apache.http.Header> defaultHeaders)
Set the default header collection (creating copies of the provided headers).
|
void |
setDnsResolver(org.apache.http.conn.DnsResolver dnsResolver)
DNS resolver to use;
SystemDefaultDnsResolver() is the default. |
void |
setFollowRedirects(boolean followRedirects) |
void |
setIncludeBinaryContentInCrawling(boolean includeBinaryContentInCrawling) |
void |
setIncludeHttpsPages(boolean includeHttpsPages) |
void |
setMaxConnectionsPerHost(int maxConnectionsPerHost) |
void |
setMaxDepthOfCrawling(int maxDepthOfCrawling)
Maximum depth of crawling. For unlimited depth, this parameter should be set to -1.
|
void |
setMaxDownloadSize(int maxDownloadSize) |
void |
setMaxOutgoingLinksToFollow(int maxOutgoingLinksToFollow) |
void |
setMaxPagesToFetch(int maxPagesToFetch)
Maximum number of pages to fetch. For an unlimited number of pages, this parameter should be
set to -1.
|
void |
setMaxTotalConnections(int maxTotalConnections) |
void |
setOnlineTldListUpdate(boolean online)
Should the TLD list be updated automatically on each run? Alternatively,
it can be loaded from the embedded tld-names.txt resource file that was
obtained from https://publicsuffix.org/list/effective_tld_names.dat
|
void |
setPolitenessDelay(int politenessDelay)
Politeness delay in milliseconds (delay between sending two requests to
the same host).
|
void |
setProcessBinaryContentInCrawling(boolean processBinaryContentInCrawling)
Should we process binary content such as images, audio, ...?
|
void |
setProxyHost(String proxyHost) |
void |
setProxyPassword(String proxyPassword)
If the crawler should run behind a proxy and a username/password is needed for
authentication with the proxy, this parameter can be used for specifying the password.
|
void |
setProxyPort(int proxyPort) |
void |
setProxyUsername(String proxyUsername) |
void |
setRespectNoFollow(boolean respectNoFollow) |
void |
setRespectNoIndex(boolean respectNoIndex) |
void |
setResumableCrawling(boolean resumableCrawling)
If this feature is enabled, you would be able to resume a previously
stopped/crashed crawl.
|
void |
setShutdownOnEmptyQueue(boolean shutdown)
Should the crawler stop running when the queue is empty?
|
void |
setSocketTimeout(int socketTimeout) |
void |
setThreadMonitoringDelaySeconds(int delay) |
void |
setThreadShutdownDelaySeconds(int delay) |
void |
setUserAgentString(String userAgentString)
User-agent string that is used to represent your crawler to web
servers.
|
String |
toString() |
void |
validate()
Validates the configs specified by this instance.
|
public void setDnsResolver(org.apache.http.conn.DnsResolver dnsResolver)
DNS resolver to use; SystemDefaultDnsResolver() is the default.
public org.apache.http.conn.DnsResolver getDnsResolver()
public void validate()
throws Exception
Exception - on validation failure
public String getCrawlStorageFolder()
public void setCrawlStorageFolder(String crawlStorageFolder)
crawlStorageFolder - The folder for the crawler's storage
public boolean isResumableCrawling()
public void setResumableCrawling(boolean resumableCrawling)
resumableCrawling - Should crawling be resumable between runs?
public void setDbLockTimeout(long dbLockTimeout)
dbLockTimeout - the lock timeout in milliseconds; see EnvironmentConfig.setLockTimeout(long, java.util.concurrent.TimeUnit)
public long getDbLockTimeout()
public int getMaxDepthOfCrawling()
public void setMaxDepthOfCrawling(int maxDepthOfCrawling)
maxDepthOfCrawling - Depth of crawling (all links on current page = depth of 1)
public int getMaxPagesToFetch()
public void setMaxPagesToFetch(int maxPagesToFetch)
maxPagesToFetch - How many pages to fetch from all threads together?
public String getUserAgentString()
public void setUserAgentString(String userAgentString)
userAgentString - Custom userAgent string to use as your crawler's identifier
public Collection<org.apache.http.message.BasicHeader> getDefaultHeaders()
public void setDefaultHeaders(Collection<? extends org.apache.http.Header> defaultHeaders)
public int getPolitenessDelay()
public void setPolitenessDelay(int politenessDelay)
politenessDelay - the delay in milliseconds.
public boolean isIncludeHttpsPages()
public void setIncludeHttpsPages(boolean includeHttpsPages)
includeHttpsPages - Should we crawl https pages?
public boolean isIncludeBinaryContentInCrawling()
public void setIncludeBinaryContentInCrawling(boolean includeBinaryContentInCrawling)
includeBinaryContentInCrawling - Should we fetch binary content such as images,
audio, ...?
public boolean isProcessBinaryContentInCrawling()
public void setProcessBinaryContentInCrawling(boolean processBinaryContentInCrawling)
public int getMaxConnectionsPerHost()
public void setMaxConnectionsPerHost(int maxConnectionsPerHost)
maxConnectionsPerHost - Maximum connections per host
public int getMaxTotalConnections()
public void setMaxTotalConnections(int maxTotalConnections)
maxTotalConnections - Maximum total connections
public int getSocketTimeout()
public void setSocketTimeout(int socketTimeout)
socketTimeout - Socket timeout in milliseconds
public int getConnectionTimeout()
public void setConnectionTimeout(int connectionTimeout)
connectionTimeout - Connection timeout in milliseconds
public int getMaxOutgoingLinksToFollow()
public void setMaxOutgoingLinksToFollow(int maxOutgoingLinksToFollow)
maxOutgoingLinksToFollow - Max number of outgoing links which are processed from a page
public int getMaxDownloadSize()
public void setMaxDownloadSize(int maxDownloadSize)
maxDownloadSize - Max allowed size of a page. Pages larger than this size will not be
fetched.
public boolean isFollowRedirects()
public void setFollowRedirects(boolean followRedirects)
followRedirects - Should we follow redirects?
public boolean isShutdownOnEmptyQueue()
public void setShutdownOnEmptyQueue(boolean shutdown)
public boolean isOnlineTldListUpdate()
public void setOnlineTldListUpdate(boolean online)
public String getProxyHost()
public void setProxyHost(String proxyHost)
proxyHost - If the crawler should run behind a proxy, this parameter can be used for
specifying the proxy host.
public int getProxyPort()
public void setProxyPort(int proxyPort)
proxyPort - If the crawler should run behind a proxy, this parameter can be used for
specifying the proxy port.
public String getProxyUsername()
public void setProxyUsername(String proxyUsername)
proxyUsername - If the crawler should run behind a proxy and a username/password is needed for
authentication with the proxy, this parameter can be used for specifying the username.
public String getProxyPassword()
public void setProxyPassword(String proxyPassword)
proxyPassword - the proxy password
public void addAuthInfo(AuthInfo authInfo)
public void setAuthInfos(List<AuthInfo> authInfos)
authInfos - authentication information to set
public int getThreadMonitoringDelaySeconds()
public void setThreadMonitoringDelaySeconds(int delay)
public int getThreadShutdownDelaySeconds()
public void setThreadShutdownDelaySeconds(int delay)
public int getCleanupDelaySeconds()
public void setCleanupDelaySeconds(int delay)
public String getCookiePolicy()
public void setCookiePolicy(String cookiePolicy)
public org.apache.http.client.CookieStore getCookieStore()
Gets the configured CookieStore or null if none is set
Returns: the configured CookieStore
public void setCookieStore(org.apache.http.client.CookieStore cookieStore)
Sets the CookieStore to be used
cookieStore - the CookieStore
public boolean isRespectNoFollow()
Returns whether the respectNoFollow setting is enabled
public void setRespectNoFollow(boolean respectNoFollow)
public boolean isRespectNoIndex()
public void setRespectNoIndex(boolean respectNoIndex)
Copyright © 2018. All rights reserved.