Guide to using threads for scraping - load x (and only x) pages at a time, and write the result to the DB from each thread
Old thread with the question removed :-)
Instead, I thought I would add something to the community.
Finally, my threaded project is working correctly! :-)
For everyone else who needs to run worker threads (because, e.g., the pages load slowly but you have plenty of processing power left over for regex work):
I am currently spawning 5 threads at a time, but you could easily go with 10 or more.
I am using public proxies. These need to be set in every spawned thread, so I pass the same pool along and make it accessible in all threads.
A rather inelegant way of testing whether the thread is finished was needed, and since Session.isCompleted() returns a boolean (not a Boolean) we can't really synchronize on that variable in a good way. Tell me if you find a better way :-)
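In essence, the wait logic is just a poll-with-timeout loop. As a minimal sketch of the pattern used in ScrapingThread.run() below (relying only on the isCompleted() call shown there), it could be factored into a small helper:

boolean waitForCompletion(RunnableScrapingSession tSession, long timeoutMillis)
        throws InterruptedException {
    // Poll tSession.isCompleted() every two seconds until it reports true
    // or the timeout expires; this mirrors the loop in ScrapingThread.run().
    long deadline = System.currentTimeMillis() + timeoutMillis;
    while (System.currentTimeMillis() < deadline) {
        if (tSession.isCompleted()) {
            return true;
        }
        Thread.sleep(2000);
    }
    return tSession.isCompleted();
}

Anyway, here is the ThreadRunner class: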
import java.util.Enumeration;
// (plus the screen-scraper API imports for ScrapingSession, RunnableScrapingSession and ProxyServerPool)

/**
 * The Class ThreadRunner.
 */
public class ThreadRunner {

    /** The max threads. */
    private int maxThreads;

    /** The max queue size. */
    private int maxQueueSize;

    /** The vars that should be transferred from the main session to the workers. This is a good place to store DB objects and so on. */
    private ScreenScraperRows vars;

    /** The scrapeable session name. */
    private String scrapeableSessionName;

    /** The session. */
    private ScrapingSession session;

    /**
     * Instantiates a new thread runner.
     *
     * @param session the session
     * @param vars the vars
     * @param maxThreads the max threads
     * @param maxQueueSize the max queue size
     * @param scrapeableSessionName the scrapeable session name
     * @throws Exception the exception
     */
    public ThreadRunner(ScrapingSession session, ScreenScraperRows vars, int maxThreads, int maxQueueSize, String scrapeableSessionName) throws Exception {
        this.maxThreads = maxThreads;
        this.maxQueueSize = maxQueueSize;
        this.scrapeableSessionName = scrapeableSessionName;
        this.session = session;
        this.vars = vars;
    }

    /**
     * Start scrape.
     *
     * @throws Exception the exception
     */
    public void startScrape() throws Exception {
        WorkQueue wq = new WorkQueue(maxThreads, maxThreads, maxQueueSize, 30);
        Enumeration<ScreenScraperRow> elements = vars.elements();
        while (elements.hasMoreElements()) {
            wq.execute(new ScrapingThread(elements.nextElement(), scrapeableSessionName));
        }
        wq.shutDown();
    }

    /**
     * The Class ScrapingThread.
     */
    private class ScrapingThread extends Thread {

        /** The worker session. */
        RunnableScrapingSession tSession;

        /** The vars for this row. */
        ScreenScraperRow vars;

        /** The scrapeable session name. */
        String scrapeableSessionName;

        /** The proxy server pool. */
        ProxyServerPool psp;

        /**
         * Instantiates a new scraping thread.
         *
         * @param vars the vars
         * @param scrapeableSessionName the scrapeable session name
         * @throws Exception the exception
         */
        public ScrapingThread(ScreenScraperRow vars, String scrapeableSessionName) throws Exception {
            this.vars = vars;
            this.scrapeableSessionName = scrapeableSessionName;
            this.psp = ThreadRunner.this.session.getProxyServerPool();
        }

        /* (non-Javadoc)
         * @see java.lang.Thread#run()
         */
        public void run() {
            try {
                // create a new scraping session
                this.tSession = new RunnableScrapingSession(scrapeableSessionName);
            }
            catch (Exception e) {
                e.printStackTrace();
                return; // without a session there is nothing to scrape
            }
            // copy all the vars we want to use into the new session
            Enumeration<String> keys = vars.keys();
            while (keys.hasMoreElements()) {
                String key = keys.nextElement();
                this.tSession.setVariable(key, vars.get(key));
            }
            // remember the pool!
            this.tSession.setVariable("proxyServerPool", this.psp);
            // start the scrape job
            this.tSession.scrape();
            // set a 1 minute timeout for each thread (30 polls x 2 seconds)
            for (int i = 0; i < 30; i++) {
                try {
                    // sleep for 2 seconds
                    sleep(2000);
                    // see if the scrape is completed
                    if (this.tSession.isCompleted()) {
                        break; // yay, the scrape is complete and we can move on!
                    }
                } catch (InterruptedException e) {
                    e.printStackTrace();
                }
            }
            try {
                this.finalize(); // Dunno if this makes a difference at all, but I hope GC will speed up
            }
            catch (Throwable e) {
                e.printStackTrace();
            }
        }
    }
}
The WorkQueue:
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

/**
 * The Class WorkQueue.
 */
public final class WorkQueue {

    /** The default pool size. */
    static int poolSize = 5;

    /** The default max pool size. */
    static int maxPoolSize = 5;

    /** The default keep alive time (seconds). */
    static long keepAliveTime = 60;

    /** The default max queue size. */
    static int maxQueueSize = 500;

    /** The thread pool. */
    ThreadPoolExecutor threadPool = null;

    /** The queue. */
    ArrayBlockingQueue<Runnable> queue;

    /**
     * Instantiates a new work queue with the default settings.
     */
    public WorkQueue() {
        this(poolSize, maxPoolSize, maxQueueSize, keepAliveTime);
    }

    /**
     * Instantiates a new work queue.
     *
     * @param poolSize the pool size
     * @param maxPoolSize the max pool size
     * @param maxQueueSize the max queue size
     * @param keepAliveTime the keep alive time
     */
    public WorkQueue(int poolSize, int maxPoolSize, int maxQueueSize, long keepAliveTime) {
        this.queue = new ArrayBlockingQueue<Runnable>(maxQueueSize);
        threadPool = new ThreadPoolExecutor(poolSize, maxPoolSize, keepAliveTime, TimeUnit.SECONDS, queue);
    }

    /**
     * Execute.
     *
     * @param task the task
     */
    public void execute(Runnable task) {
        threadPool.execute(task);
    }

    /**
     * Shut down.
     */
    public void shutDown() {
        threadPool.shutdown();
    }

    /**
     * Gets the task count.
     *
     * @return the task count
     */
    public Long getTaskCount() {
        return threadPool.getTaskCount();
    }
}
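Two notes on this WorkQueue (not specific to screen-scraper, just how ThreadPoolExecutor behaves): execute() throws a RejectedExecutionException once the bounded queue is full and all threads are busy (passing new ThreadPoolExecutor.CallerRunsPolicy() as an extra constructor argument would make the submitting thread run the task itself instead), and shutdown() only stops new tasks from being accepted - it does not wait for the queued scrapes to finish. If the caller needs to block until everything is done (for example at the end of startScrape()), something like this could be added to WorkQueue - just a sketch using the standard awaitTermination() call:

/**
 * Blocks until every queued task has finished, or the timeout expires.
 * Call this after shutDown() if you need to know when all scrapes are done.
 *
 * @param timeoutSeconds how long to wait before giving up
 * @return true if the pool terminated within the timeout
 * @throws InterruptedException if the waiting thread is interrupted
 */
public boolean awaitCompletion(long timeoutSeconds) throws InterruptedException {
    return threadPool.awaitTermination(timeoutSeconds, TimeUnit.SECONDS);
}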
Another edit:
The classes ScreenScraperRow and ScreenScraperRows:
import java.util.Hashtable;

/**
 * The Class ScreenScraperRow.
 */
public class ScreenScraperRow extends Hashtable<String, Object> {

    /** The Constant serialVersionUID. */
    private static final long serialVersionUID = 7645367091883512144L;

    /**
     * Instantiates a new screen scraper row.
     *
     * @param initialCapacity the initial capacity
     */
    public ScreenScraperRow(int initialCapacity) {
        super(initialCapacity);
    }
}

/**
 * The Class ScreenScraperRows.
 */
public class ScreenScraperRows extends Hashtable<Integer, ScreenScraperRow> {

    /** The Constant serialVersionUID. */
    private static final long serialVersionUID = -8207232001472178116L;

    /**
     * Instantiates a new screen scraper rows.
     *
     * @param initialCapacity the initial capacity
     */
    public ScreenScraperRows(int initialCapacity) {
        super(initialCapacity);
    }
}
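To tie it all together, here is roughly how the pieces can be wired up from a script running in the main scraping session. This is just a sketch: the scrapeable session name "DetailPage", the "URL" session variable and the example URLs are made-up placeholders, and 'session' is the current ScrapingSession available to the script (the same object passed to the ThreadRunner constructor).

import java.util.Arrays;
import java.util.List;

// One ScreenScraperRow per page to scrape; each row becomes one RunnableScrapingSession.
List<String> urls = Arrays.asList(
        "http://www.example.com/page1",
        "http://www.example.com/page2");

ScreenScraperRows rows = new ScreenScraperRows(urls.size());
int i = 0;
for (String url : urls) {
    ScreenScraperRow row = new ScreenScraperRow(2);
    row.put("URL", url);   // copied into the worker session via setVariable()
    // DB objects or other shared helpers could be put into the row the same way
    rows.put(Integer.valueOf(i++), row);
}

// 5 worker threads, room for 500 queued jobs, running the "DetailPage" scrapeable session
ThreadRunner runner = new ThreadRunner(session, rows, 5, 500, "DetailPage");
runner.startScrape();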
Have fun :-)
Very impressive! Thanks for sharing. I'm the original author of screen-scraper, and it's great to see people using it in ways that I never even considered :)
Todd Wilson