public class Spider extends Object implements Runnable, Task
Downloader,
Scheduler,
PageProcessor,
Pipeline
| 限定符和类型 | 类和说明 |
|---|---|
static class |
Spider.Status |
| 限定符和类型 | 字段和说明 |
|---|---|
protected boolean |
destroyWhenExit |
protected Downloader |
downloader |
protected ExecutorService |
executorService |
protected boolean |
exitWhenComplete |
protected org.slf4j.Logger |
logger |
protected PageProcessor |
pageProcessor |
protected List<Pipeline> |
pipelines |
protected Scheduler |
scheduler |
protected Site |
site |
protected boolean |
spawnUrl |
protected List<Request> |
startRequests |
protected AtomicInteger |
stat |
protected static int |
STAT_INIT |
protected static int |
STAT_RUNNING |
protected static int |
STAT_STOPPED |
protected int |
threadNum |
protected CountableThreadPool |
threadPool |
protected String |
uuid |
| 构造器和说明 |
|---|
Spider(PageProcessor pageProcessor)
create a spider with pageProcessor.
|
| 限定符和类型 | 方法和说明 |
|---|---|
Spider |
addPipeline(Pipeline pipeline)
add a pipeline for Spider
|
Spider |
addRequest(Request... requests)
Add urls with information to crawl.
|
Spider |
addUrl(String... urls)
Add urls to crawl.
|
protected void |
checkIfRunning() |
Spider |
clearPipeline()
clear the pipelines set
|
void |
close() |
static Spider |
create(PageProcessor pageProcessor)
create a spider with pageProcessor.
|
Spider |
downloader(Downloader downloader)
已过时。
|
protected void |
extractAndAddRequests(Page page,
boolean spawnUrl) |
<T> T |
get(String url) |
<T> List<T> |
getAll(Collection<String> urls)
Download urls synchronously.
|
protected CollectorPipeline |
getCollectorPipeline() |
long |
getPageCount()
Get page count downloaded by spider.
|
Scheduler |
getScheduler() |
Site |
getSite()
site of a task
|
List<SpiderListener> |
getSpiderListeners() |
Date |
getStartTime() |
Spider.Status |
getStatus()
Get running status by spider.
|
int |
getThreadAlive()
Get the number of threads that are running.
|
String |
getUUID()
unique id for a task.
|
protected void |
initComponent() |
boolean |
isExitWhenComplete() |
boolean |
isSpawnUrl() |
protected void |
onError(Request request) |
protected void |
onSuccess(Request request) |
Spider |
pipeline(Pipeline pipeline)
已过时。
|
protected void |
processRequest(Request request) |
void |
run() |
void |
runAsync() |
Spider |
scheduler(Scheduler scheduler)
set scheduler for Spider
|
Spider |
setDownloader(Downloader downloader)
set the downloader of spider
|
void |
setEmptySleepTime(int emptySleepTime)
Set wait time when no url is polled.
|
Spider |
setExecutorService(ExecutorService executorService) |
Spider |
setExitWhenComplete(boolean exitWhenComplete)
Exit when complete.
|
Spider |
setPipelines(List<Pipeline> pipelines)
set pipelines for Spider
|
Spider |
setScheduler(Scheduler scheduler)
set scheduler for Spider
|
Spider |
setSpawnUrl(boolean spawnUrl)
Whether to add extracted urls for download.
|
Spider |
setSpiderListeners(List<SpiderListener> spiderListeners) |
Spider |
setUUID(String uuid)
Set a uuid for the spider.
|
protected void |
sleep(int time) |
void |
start() |
Spider |
startRequest(List<Request> startRequests)
Set startUrls of Spider.
|
Spider |
startUrls(List<String> startUrls)
Set startUrls of Spider.
|
void |
stop() |
void |
test(String... urls)
Process specific urls without url discovering.
|
Spider |
thread(ExecutorService executorService,
int threadNum)
start with more than one thread
|
Spider |
thread(int threadNum)
start with more than one thread
|
protected Downloader downloader
protected PageProcessor pageProcessor
protected Site site
protected String uuid
protected Scheduler scheduler
protected org.slf4j.Logger logger
protected CountableThreadPool threadPool
protected ExecutorService executorService
protected int threadNum
protected AtomicInteger stat
protected boolean exitWhenComplete
protected static final int STAT_INIT
protected static final int STAT_RUNNING
protected static final int STAT_STOPPED
protected boolean spawnUrl
protected boolean destroyWhenExit
public Spider(PageProcessor pageProcessor)
pageProcessor -
public static Spider create(PageProcessor pageProcessor)
pageProcessor - PageProcessor
public Spider startUrls(List<String> startUrls)
startUrls -
public Spider startRequest(List<Request> startRequests)
startRequests -
public Spider setUUID(String uuid)
uuid -
public Spider scheduler(Scheduler scheduler)
scheduler - setScheduler(us.codecraft.webmagic.scheduler.Scheduler)
public Spider setScheduler(Scheduler scheduler)
scheduler - Scheduler
public Spider pipeline(Pipeline pipeline)
pipeline - addPipeline(us.codecraft.webmagic.pipeline.Pipeline)
public Spider addPipeline(Pipeline pipeline)
pipeline - Pipeline
public Spider setPipelines(List<Pipeline> pipelines)
pipelines - Pipeline
public Spider clearPipeline()
public Spider downloader(Downloader downloader)
downloader - setDownloader(us.codecraft.webmagic.downloader.Downloader)
public Spider setDownloader(Downloader downloader)
downloader - Downloader
protected void initComponent()
protected void onError(Request request)
protected void onSuccess(Request request)
public void close()
public void test(String... urls)
urls - urls to process
protected void processRequest(Request request)
protected void sleep(int time)
protected void extractAndAddRequests(Page page, boolean spawnUrl)
protected void checkIfRunning()
public void runAsync()
public <T> List<T> getAll(Collection<String> urls)
urls -
protected CollectorPipeline getCollectorPipeline()
public <T> T get(String url)
public Spider addRequest(Request... requests)
requests -
public void start()
public void stop()
public Spider thread(int threadNum)
threadNum -
public Spider thread(ExecutorService executorService, int threadNum)
threadNum -
public boolean isExitWhenComplete()
public Spider setExitWhenComplete(boolean exitWhenComplete)
exitWhenComplete -
public boolean isSpawnUrl()
public long getPageCount()
public Spider.Status getStatus()
Spider.Status
public int getThreadAlive()
public Spider setSpawnUrl(boolean spawnUrl)
spawnUrl -
public Spider setExecutorService(ExecutorService executorService)
public List<SpiderListener> getSpiderListeners()
public Spider setSpiderListeners(List<SpiderListener> spiderListeners)
public Date getStartTime()
public Scheduler getScheduler()
public void setEmptySleepTime(int emptySleepTime)
emptySleepTime - In MILLISECONDS.

Copyright © 2014. All rights reserved.