Guide to using threads for scraping - load x (and only x) pages at a time, and write the result to the DB from each thread

old thread with question removed :-)

Instead I thought I would add something to the community.

Finally my threaded project is working correctly! :-)

For everyone else who needs to run worker threads (e.g. because the pages load slowly but you have plenty of processing power for the regex work):
I am currently spawning 5 threads at a time, but you could easily go with 10 or more.

I am using public proxies. These need to be set in every thread spawned. I have set them so the same pool is accessible in all threads.

A rather inelegant way of testing whether the thread is finished was needed: since Session.isCompleted() returns a primitive boolean (not a Boolean object), we can't really synchronize on that value in a good way. Tell me if you find a better way :-)

/**
 * The Class ThreadRunner.
 */
public class ThreadRunner {
       
        /** The max threads. */
        private int maxThreads;
       
        /** The max queue size. */
        private int maxQueueSize;
       
        /** The vars that should be transferred from main session to the workers. This is a good place to store DB objects and so on. */
        private ScreenScraperRows vars;
       
        /** The scrapeable session name. */
        private String scrapeableSessionName;
       
        /** The session. */
        private ScrapingSession session;
       
        /**
         * Instantiates a new thread runner.
         *
         * @param session the session
         * @param vars the vars
         * @param maxThreads the max threads
         * @param maxQueueSize the max queue size
         * @param scrapeableSessionName the scrapeable session name
         * @throws Exception the exception
         */
        public ThreadRunner(ScrapingSession session, ScreenScraperRows vars, int maxThreads, int maxQueueSize, String scrapeableSessionName) throws Exception{
               
                this.maxThreads = maxThreads;
                this.maxQueueSize = maxQueueSize;
                this.scrapeableSessionName = scrapeableSessionName;
                this.session = session;
                this.vars = vars;
               
        }
       
       
        /**
         * Start scrape.
         *
         * @throws Exception the exception
         */
        public void startScrape() throws Exception{
               
                WorkQueue wq = new WorkQueue( maxThreads, maxThreads, maxQueueSize, 30);
                Enumeration<ScreenScraperRow> elements =  vars.elements();
                while (elements.hasMoreElements()){
                        wq.execute(new ScrapingThread( elements.nextElement(), scrapeableSessionName) );
                }
                wq.shutDown();
               
        }
       
        /**
         * The Class ScrapingThread.
         */
        private class ScrapingThread extends Thread{
               
                /** The t session. */
                RunnableScrapingSession tSession;
               
                /** The vars. */
                ScreenScraperRow vars;
               
                /** The scrapeable session name. */
                String scrapeableSessionName;
               
                /** The psp. */
                ProxyServerPool psp;
               
                /**
                 * Instantiates a new scraping thread.
                 *
                 * @param vars the vars
                 * @param scrapeableSessionName the scrapeable session name
                 * @throws Exception the exception
                 */
                public ScrapingThread(ScreenScraperRow vars, String scrapeableSessionName) throws Exception{                   
                        ScrapingThread.this.vars = vars;
                        ScrapingThread.this.scrapeableSessionName = scrapeableSessionName;
                        ScrapingThread.this.psp = ThreadRunner.this.session.getProxyServerPool();
                }
               
                /* (non-Javadoc)
                 * @see java.lang.Thread#run()
                 */
                public void run() {
                       
                        try {
                                //create a new scraping session
                                ScrapingThread.this.tSession = new RunnableScrapingSession(scrapeableSessionName);
                        }
                        catch (Exception e) {
                                e.printStackTrace();
                        }
                        //copying all the vars we want to use in the new session
                        Enumeration<String> keys =  vars.keys();
                        while (keys.hasMoreElements()){
                                String key = keys.nextElement();
                                ScrapingThread.this.tSession.setVariable(key, vars.get(key));
                        }
                        //remember the pool!
                        ScrapingThread.this.tSession.setVariable("proxyServerPool", ScrapingThread.this.psp);
                        //start the scrape job
                        ScrapingThread.this.tSession.scrape();
                       
                        // set a 1 minute timeout for each thread
                        for (int i = 0; i<30; i++){
                                try {
                                        //sleep for 2 seconds
                                        sleep(2000);
                                        //see if the thread is completed
                                        if (ScrapingThread.this.tSession.isCompleted()){
                                                break; //yay the thread was complete, and we can move on!
                                        }
                                } catch (InterruptedException e) {
                                        e.printStackTrace();
                                }
                        }
                        try {
                                this.finalize(); //Dunno if this makes a difference at all, but I hope GC will speed up
                        }
                        catch (Throwable e) {  
                                e.printStackTrace();
                        }                              
                }
               
        }      
}

The WorkQueue:

/**
 * A small wrapper around a {@link ThreadPoolExecutor} backed by a bounded
 * {@link ArrayBlockingQueue}.
 *
 * <p>When all pool threads are busy and the queue is full, {@link #execute(Runnable)}
 * blocks the submitting thread until queue space becomes available instead of
 * failing with a {@code RejectedExecutionException}. This lets a producer loop
 * submit more tasks than {@code maxPoolSize + maxQueueSize} without being aborted
 * half-way. Tasks submitted after {@link #shutDown()} are still rejected.
 */
public final class WorkQueue {

    /** Default core pool size used by the no-argument constructor. */
    static int poolSize = 5;

    /** Default maximum pool size used by the no-argument constructor. */
    static int maxPoolSize = 5;

    /** Default keep-alive time, in seconds, used by the no-argument constructor. */
    static long keepAliveTime = 60;

    /** Default queue capacity used by the no-argument constructor. */
    static int maxQueueSize = 500;

    /** The thread pool that runs the submitted tasks. */
    ThreadPoolExecutor threadPool = null;

    /** The bounded queue holding tasks that wait for a free pool thread. */
    ArrayBlockingQueue<Runnable> queue;

    /**
     * Instantiates a new work queue with the default sizes and keep-alive time.
     */
    public WorkQueue() {
        this(poolSize, maxPoolSize, maxQueueSize, keepAliveTime);
    }

    /**
     * Instantiates a new work queue.
     *
     * @param poolSize the core pool size
     * @param maxPoolSize the maximum pool size
     * @param maxQueueSize the capacity of the pending-task queue
     * @param keepAliveTime the keep-alive time for idle threads, in seconds
     * @throws IllegalArgumentException if the sizes are rejected by
     *         {@link ArrayBlockingQueue} or {@link ThreadPoolExecutor}
     */
    public WorkQueue(int poolSize, int maxPoolSize, int maxQueueSize, long keepAliveTime) {
        this.queue = new ArrayBlockingQueue<Runnable>(maxQueueSize);
        this.threadPool = new ThreadPoolExecutor(poolSize, maxPoolSize, keepAliveTime,
                TimeUnit.SECONDS, queue, new BlockWhenSaturatedPolicy());
    }

    /**
     * Submits a task. If the pool is saturated (all threads busy and the queue full)
     * the calling thread blocks until the task fits into the queue.
     *
     * @param task the task to run
     * @throws java.util.concurrent.RejectedExecutionException if the queue has been
     *         shut down, or the calling thread is interrupted while waiting for space
     * @throws NullPointerException if {@code task} is null
     */
    public void execute(Runnable task) {
        threadPool.execute(task);
    }

    /**
     * Initiates an orderly shutdown: already submitted tasks are still executed, new
     * tasks are rejected. Does not wait for the submitted tasks to finish.
     */
    public void shutDown() {
        threadPool.shutdown();
    }

    /**
     * Gets the task count as reported by the underlying pool, which documents it as
     * an approximation.
     *
     * @return the approximate number of tasks ever scheduled for execution
     */
    public Long getTaskCount() {
        return Long.valueOf(threadPool.getTaskCount());
    }

    /**
     * Rejection handler that turns "pool saturated" into back-pressure on the
     * submitter: the rejected task is put into the work queue with a blocking
     * {@code put}. Rejections caused by shutdown are reported as usual.
     */
    private static final class BlockWhenSaturatedPolicy
            implements java.util.concurrent.RejectedExecutionHandler {

        @Override
        public void rejectedExecution(Runnable task, ThreadPoolExecutor executor) {
            if (executor.isShutdown()) {
                throw new java.util.concurrent.RejectedExecutionException(
                        "WorkQueue has been shut down");
            }
            try {
                executor.getQueue().put(task);
            } catch (InterruptedException e) {
                // Preserve the interrupt for the caller and report that the task was not queued.
                Thread.currentThread().interrupt();
                throw new java.util.concurrent.RejectedExecutionException(
                        "Interrupted while waiting for queue space", e);
            }
            // The pool may have been shut down while we were blocked; if the task is
            // still sitting in the queue, pull it back out so it is not silently stranded.
            if (executor.isShutdown() && executor.getQueue().remove(task)) {
                throw new java.util.concurrent.RejectedExecutionException(
                        "WorkQueue was shut down while the task was waiting for queue space");
            }
        }
    }
}

Another edit:
The objects called ScreenScraperRow and ScreenScraperRows

import java.util.Hashtable;

/**
 * One row of values: a {@link Hashtable} mapping {@code String} keys to arbitrary
 * objects.
 */
public class ScreenScraperRow extends Hashtable<String, Object> {

    /** Version identifier used by Java serialization. */
    private static final long serialVersionUID = 7645367091883512144L;

    /**
     * Creates an empty row.
     *
     * @param initialCapacity the initial capacity handed to the {@link Hashtable} constructor
     */
    public ScreenScraperRow(final int initialCapacity) {
        super(initialCapacity);
    }
}

import java.util.Hashtable;

/**
 * The Class ScreenScraperRows.
 */
public class ScreenScraperRows extends Hashtable<Integer, ScreenScraperRow> {

        /** The Constant serialVersionUID. */
        private static final long serialVersionUID = -8207232001472178116L;
       
        /**
         * Instantiates a new screen scraper rows.
         *
         * @param initialCapcity the initial capcity
         */
        public ScreenScraperRows(int initialCapcity){
                super(initialCapcity);
               
        }      
}

Have fun :-)

Very impressive! Thanks for

Very impressive! Thanks for sharing. I'm the original author of screen-scraper, and it's great to see people using it in ways that I never even considered :)

Todd Wilson