Pretty post data

The pretty post data appears to be hidden internally on the site, so we cannot get the parameters through the screen-scraper tool, even though the raw post data contains those parameters. The URL below is one example of this type:

http://www.centralbedfordshire.gov.uk/PLANTECH/DCWebPages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch

Please help us find out where on the site these parameters get passed internally.

Pretty post data

The dates are still not getting passed in the post data. The screen-scraper tool is showing 95003 even though I passed Registration Date From - 01/04/2011 and Registration Date To - 10/04/2011. The site displays 59 results for this date range.

You'll want to import this

You'll want to import this one the same way. Notice there is an init script to allow easy setting of the dates.

<?xml version="1.0" encoding="UTF-8"?>
<!--
  Exported screen-scraper scraping session "centralbedfordshire".

  Flow, as defined by the elements below:
    * A session-level init script sets the session variables DATE_START
      and DATE_END; edit those two values to change the searched range.
    * Scrapeable file "Search page" (sequence 1) requests acolnetcgi.gov
      with the GET parameters ACTION=UNWRAP and RIPNAME=Root.pgesearch.
      Its "Session" extractor pattern captures the RIPSESSION token as RIP,
      and the "Decode RIP" script URL-decodes that token (UTF-8), logs it
      and stores it in the session variable RIP.
    * Scrapeable file "Search results" (sequence 2) requests the same URL
      with ACTION=UNWRAP and RIPSESSION=~#RIP#~ as GET parameters and the
      search form fields as POST parameters; regdate1 and regdate2 are
      filled from ~#DATE_START#~ and ~#DATE_END#~. A script on this file
      calls scrapeableFile.setForceMultiPart( true ), and the
      "Count of results" pattern captures the total in the RESULTS token.

  NOTE(review): the numeric when-to-run codes (10, 30, 80) are presumably
  "before session", "before file is scraped" and "after each pattern
  match"; confirm against the screen-scraper documentation.
  NOTE(review): text inside pattern-text and script-text is significant;
  do not reformat or re-indent this file before importing it.
-->
<scraping-session use-strict-mode="true"><script-instances><script-instances when-to-run="10" sequence="1" enabled="true"><script><script-text>session.setv("DATE_START", "01/04/2011");
session.setv("DATE_END", "10/04/2011");</script-text><name>centralbedfordshire--init</name><language>Interpreted Java</language></script></script-instances><owner-type>ScrapingSession</owner-type><owner-name>centralbedfordshire</owner-name></script-instances><name>centralbedfordshire</name><notes></notes><cookiePolicy>0</cookiePolicy><maxHTTPRequests>1</maxHTTPRequests><external_proxy_username></external_proxy_username><external_proxy_password></external_proxy_password><external_proxy_host></external_proxy_host><external_proxy_port></external_proxy_port><external_nt_proxy_username></external_nt_proxy_username><external_nt_proxy_password></external_nt_proxy_password><external_nt_proxy_domain></external_nt_proxy_domain><external_nt_proxy_host></external_nt_proxy_host><anonymize>false</anonymize><terminate_proxies_on_completion>false</terminate_proxies_on_completion><number_of_required_proxies>5</number_of_required_proxies><originator_edition>2</originator_edition><logging_level>1</logging_level><date_exported>April 27, 2011 14:34:57</date_exported><character_set>UTF-8</character_set><scrapeable-files sequence="2" will-be-invoked-manually="false" tidy-html="jtidy"><last-scraped-data></last-scraped-data><URL>http://www.centralbedfordshire.gov.uk/PLANTECH/DCWebPages/acolnetcgi.gov</URL><last-request></last-request><name>Search results</name><extractor-patterns sequence="1" automatically-save-in-session-variable="false" if-saved-in-session-variable="0" filter-duplicates="false" cache-data-set="false" will-be-invoked-manually="false"><pattern-text>&gt;1 to 10 of ~@RESULTS@~ Results&lt;</pattern-text><identifier>Count of results</identifier><extractor-pattern-tokens optional="false" save-in-session-variable="false" compound-key="true" strip-html="false" resolve-relative-url="false" replace-html-entities="false" trim-white-space="false" exclude-from-data="false" null-session-variable="false" 
sequence="1"><regular-expression>[\d,]+</regular-expression><identifier>RESULTS</identifier></extractor-pattern-tokens><script-instances><owner-type>ExtractorPattern</owner-type><owner-name>Count of results</owner-name></script-instances></extractor-patterns><HTTPParameters sequence="16"><key>regdate2</key><type>POST</type><value>~#DATE_END#~</value></HTTPParameters><HTTPParameters sequence="11"><key>appntype</key><type>POST</type><value></value></HTTPParameters><HTTPParameters sequence="23"><key>apldcndate2</key><type>POST</type><value></value></HTTPParameters><HTTPParameters sequence="18"><key>dcndate1</key><type>POST</type><value></value></HTTPParameters><HTTPParameters sequence="22"><key>apldcndate1</key><type>POST</type><value></value></HTTPParameters><HTTPParameters sequence="2"><key>RIPSESSION</key><type>GET</type><value>~#RIP#~</value></HTTPParameters><HTTPParameters sequence="12"><key>casetype</key><type>POST</type><value></value></HTTPParameters><HTTPParameters sequence="1"><key>ACTION</key><type>GET</type><value>UNWRAP</value></HTTPParameters><HTTPParameters sequence="10"><key>cnsrarea</key><type>POST</type><value></value></HTTPParameters><HTTPParameters sequence="5"><key>appname</key><type>POST</type><value></value></HTTPParameters><HTTPParameters sequence="21"><key>aplrecdate2</key><type>POST</type><value></value></HTTPParameters><HTTPParameters sequence="7"><key>proposal</key><type>POST</type><value></value></HTTPParameters><HTTPParameters sequence="9"><key>WkListDate</key><type>POST</type><value></value></HTTPParameters><HTTPParameters sequence="4"><key>locaddress1</key><type>POST</type><value></value></HTTPParameters><HTTPParameters sequence="14"><key>obligstatus</key><type>POST</type><value></value></HTTPParameters><HTTPParameters sequence="13"><key>statclass</key><type>POST</type><value></value></HTTPParameters><HTTPParameters sequence="24"><key>registername</key><type>POST</type><value>Full Register</value></HTTPParameters><HTTPParameters 
sequence="3"><key>casefullref</key><type>POST</type><value></value></HTTPParameters><HTTPParameters sequence="8"><key>parishname</key><type>POST</type><value></value></HTTPParameters><HTTPParameters sequence="17"><key>ComDate</key><type>POST</type><value></value></HTTPParameters><HTTPParameters sequence="15"><key>regdate1</key><type>POST</type><value>~#DATE_START#~</value></HTTPParameters><HTTPParameters sequence="20"><key>aplrecdate1</key><type>POST</type><value></value></HTTPParameters><HTTPParameters sequence="6"><key>agtname</key><type>POST</type><value></value></HTTPParameters><HTTPParameters sequence="19"><key>dcndate2</key><type>POST</type><value></value></HTTPParameters><script-instances><script-instances when-to-run="30" sequence="1" enabled="true"><script><script-text>scrapeableFile.setForceMultiPart( true );</script-text><name>centralbedfordshire--force multi-part</name><language>Interpreted Java</language></script></script-instances><owner-type>ScrapeableFile</owner-type><owner-name>Search results</owner-name></script-instances></scrapeable-files><scrapeable-files sequence="1" will-be-invoked-manually="false" tidy-html="jtidy"><last-scraped-data></last-scraped-data><URL>http://www.centralbedfordshire.gov.uk/PLANTECH/DCWebPages/acolnetcgi.gov</URL><last-request></last-request><name>Search page</name><extractor-patterns sequence="1" automatically-save-in-session-variable="false" if-saved-in-session-variable="0" filter-duplicates="false" cache-data-set="false" will-be-invoked-manually="false"><pattern-text>RIPSESSION=~@RIP@~"</pattern-text><identifier>Session</identifier><extractor-pattern-tokens optional="false" save-in-session-variable="false" compound-key="true" strip-html="false" resolve-relative-url="false" replace-html-entities="false" trim-white-space="false" exclude-from-data="false" null-session-variable="false" 
sequence="1"><regular-expression>[^"]*</regular-expression><identifier>RIP</identifier></extractor-pattern-tokens><script-instances><script-instances when-to-run="80" sequence="1" enabled="true"><script><script-text>import java.net.URLDecoder;

rip = dataRecord.get("RIP");
decoded = URLDecoder.decode(rip, "UTF-8");
session.log("Session RIP: " + decoded);
session.setv("RIP", decoded);</script-text><name>Decode RIP</name><language>Interpreted Java</language></script></script-instances><owner-type>ExtractorPattern</owner-type><owner-name>Session</owner-name></script-instances></extractor-patterns><HTTPParameters sequence="2"><key>RIPNAME</key><type>GET</type><value>Root.pgesearch</value></HTTPParameters><HTTPParameters sequence="1"><key>ACTION</key><type>GET</type><value>UNWRAP</value></HTTPParameters><script-instances><owner-type>ScrapeableFile</owner-type><owner-name>Search page</owner-name></script-instances></scrapeable-files></scraping-session>

Force Multipart

Can you please tell me what the Force Multipart script is used for in the session? I was not able to understand why it was added.

Thanks it worked.

Thanks it worked.

I've been looking at it some

I've been looking at it some more, and you're right. So I can see the need to use scrapeableFile.setRequestEntity() and scrapeableFile.setForceMultiPart(true) though it's not working yet as there is one value I don't see how to set yet. I'll keep looking.

I'm not sure I understand

I'm not sure I understand your question. On this site, I just filled in the registration to and from dates, proxied the submission, and it's pretty straightforward. I whipped this up. You will need to copy this text into a text editor, save it as "test.sss", and import it into your screen-scraper to see it:

<?xml version="1.0" encoding="ISO-8859-1"?>
<!--
  Exported screen-scraper scraping session ".Test".

  Flow, as defined by the elements below:
    * Scrapeable file "Search page" (sequence 1) requests acolnetcgi.gov
      with the GET parameters ACTION=UNWRAP and RIPNAME=Root.pgesearch.
      It has no extractor patterns.
    * Scrapeable file "Search results" (sequence 2) requests the same URL
      with the search form fields as POST parameters; regdate1 is fixed to
      18/04/2011 and regdate2 to 20/04/2011.
    * The "Applications" extractor pattern captures each result block as
      DATARECORD; its two sub-patterns capture CASE_NUMBER and DATE.
    * The "Test date" script reads DATE from the data record, converts it
      with sutil.reformatDate from "dd / MM / yyyy" to
      "yyyy-MM-dd HH:mm:ss", logs the result and writes it back to DATE.

  NOTE(review): RIPSESSION is sent as a fixed literal value rather than
  being extracted from "Search page"; it presumably goes stale once the
  server-side session ends. Confirm before reusing this session.
  NOTE(review): pattern-text contains carriage-return references followed
  by literal newlines that are part of the pattern; do not reformat or
  re-indent this file before importing it.
-->
<scraping-session use-strict-mode="true"><script-instances><owner-type>ScrapingSession</owner-type><owner-name>.Test</owner-name></script-instances><name>.Test</name><notes></notes><cookiePolicy>0</cookiePolicy><maxHTTPRequests>1</maxHTTPRequests><external_proxy_username></external_proxy_username><external_proxy_password></external_proxy_password><external_proxy_host></external_proxy_host><external_proxy_port></external_proxy_port><external_nt_proxy_username></external_nt_proxy_username><external_nt_proxy_password></external_nt_proxy_password><external_nt_proxy_domain></external_nt_proxy_domain><external_nt_proxy_host></external_nt_proxy_host><anonymize>false</anonymize><terminate_proxies_on_completion>false</terminate_proxies_on_completion><number_of_required_proxies>5</number_of_required_proxies><originator_edition>2</originator_edition><logging_level>1</logging_level><date_exported>April 26, 2011 09:03:48</date_exported><character_set>ISO-8859-1</character_set><scrapeable-files sequence="1" will-be-invoked-manually="false" tidy-html="jtidy"><last-scraped-data></last-scraped-data><URL>http://www.centralbedfordshire.gov.uk/PLANTECH/DCWebPages/acolnetcgi.gov</URL><last-request></last-request><name>Search page</name><HTTPParameters sequence="2"><key>RIPNAME</key><type>GET</type><value>Root.pgesearch</value></HTTPParameters><HTTPParameters sequence="1"><key>ACTION</key><type>GET</type><value>UNWRAP</value></HTTPParameters><script-instances><owner-type>ScrapeableFile</owner-type><owner-name>Search page</owner-name></script-instances></scrapeable-files><scrapeable-files sequence="2" will-be-invoked-manually="false" tidy-html="jtidy"><last-scraped-data></last-scraped-data><URL>http://www.centralbedfordshire.gov.uk/PLANTECH/DCWebPages/acolnetcgi.gov</URL><last-request></last-request><name>Search results</name><extractor-patterns sequence="1" automatically-save-in-session-variable="false" if-saved-in-session-variable="0" filter-duplicates="false" cache-data-set="false" 
will-be-invoked-manually="false"><pattern-text>&gt;Application Number:&lt;/th&gt;&#xd;~@DATARECORD@~&#xd;&lt;table cellspacing="0" ~@junk@~="~@END@~"</pattern-text><identifier>Applications</identifier><extractor-pattern-tokens optional="false" save-in-session-variable="false" compound-key="true" strip-html="false" resolve-relative-url="false" replace-html-entities="false" trim-white-space="false" exclude-from-data="false" null-session-variable="false" sequence="1"><identifier>DATARECORD</identifier></extractor-pattern-tokens><extractor-pattern-tokens optional="false" save-in-session-variable="false" compound-key="true" strip-html="false" resolve-relative-url="false" replace-html-entities="false" trim-white-space="false" exclude-from-data="false" null-session-variable="false" sequence="3"><regular-expression>results-table|Bottom Links</regular-expression><identifier>END</identifier></extractor-pattern-tokens><extractor-pattern-tokens optional="false" save-in-session-variable="false" compound-key="true" strip-html="false" resolve-relative-url="false" replace-html-entities="false" trim-white-space="false" exclude-from-data="false" null-session-variable="false" sequence="2"><regular-expression>[^&lt;&gt;]*</regular-expression><identifier>junk</identifier></extractor-pattern-tokens><extractor-patterns sequence="1" automatically-save-in-session-variable="false" if-saved-in-session-variable="0" filter-duplicates="false" cache-data-set="false" will-be-invoked-manually="false"><pattern-text>&gt;~@CASE_NUMBER@~&lt;/a&gt;</pattern-text><extractor-pattern-tokens optional="false" save-in-session-variable="false" compound-key="true" strip-html="false" resolve-relative-url="false" replace-html-entities="false" trim-white-space="false" exclude-from-data="false" null-session-variable="false" sequence="1"><regular-expression>[^&lt;&gt;]*</regular-expression><identifier>CASE_NUMBER</identifier></extractor-pattern-tokens><script-instances/></extractor-patterns><extractor-patterns 
sequence="2" automatically-save-in-session-variable="false" if-saved-in-session-variable="0" filter-duplicates="false" cache-data-set="false" will-be-invoked-manually="false"><pattern-text>Registration Date:&lt;/th&gt;&#xd;
&lt;td&gt;~@DATE@~&lt;</pattern-text><extractor-pattern-tokens optional="false" save-in-session-variable="false" compound-key="true" strip-html="false" resolve-relative-url="false" replace-html-entities="false" trim-white-space="false" exclude-from-data="false" null-session-variable="false" sequence="1"><regular-expression>[^&lt;&gt;]*</regular-expression><identifier>DATE</identifier></extractor-pattern-tokens><script-instances/></extractor-patterns><script-instances><script-instances when-to-run="80" sequence="1" enabled="true"><script><script-text>startDate = dataRecord.get("DATE");

newDate = sutil.reformatDate(startDate, "dd / MM / yyyy", "yyyy-MM-dd HH:mm:ss");

session.log("Reformated date is: " + newDate);
dataRecord.put("DATE", newDate);</script-text><name>Test date</name><language>Interpreted Java</language></script></script-instances><owner-type>ExtractorPattern</owner-type><owner-name>Applications</owner-name></script-instances></extractor-patterns><HTTPParameters sequence="23"><key>apldcndate2</key><type>POST</type><value></value></HTTPParameters><HTTPParameters sequence="13"><key>statclass</key><type>POST</type><value></value></HTTPParameters><HTTPParameters sequence="24"><key>registername</key><type>POST</type><value>Full Register</value></HTTPParameters><HTTPParameters sequence="18"><key>dcndate1</key><type>POST</type><value></value></HTTPParameters><HTTPParameters sequence="7"><key>proposal</key><type>POST</type><value></value></HTTPParameters><HTTPParameters sequence="22"><key>apldcndate1</key><type>POST</type><value></value></HTTPParameters><HTTPParameters sequence="17"><key>ComDate</key><type>POST</type><value></value></HTTPParameters><HTTPParameters sequence="2"><key>RIPSESSION</key><type>GET</type><value>{[*!1D1A020B050304731D3C3D31617C041E28360020373B2D3A033A2131681B1102001216030D02040A06016F691120283F3A3606243629243A06176F74777E717F7C6B7777007E6D1D2A20212C2B20080A72616372757D787C7A646173!*]}</value></HTTPParameters><HTTPParameters sequence="20"><key>aplrecdate1</key><type>POST</type><value></value></HTTPParameters><HTTPParameters sequence="4"><key>locaddress1</key><type>POST</type><value></value></HTTPParameters><HTTPParameters sequence="8"><key>parishname</key><type>POST</type><value></value></HTTPParameters><HTTPParameters sequence="1"><key>ACTION</key><type>GET</type><value>UNWRAP</value></HTTPParameters><HTTPParameters sequence="16"><key>regdate2</key><type>POST</type><value>20/04/2011</value></HTTPParameters><HTTPParameters sequence="10"><key>cnsrarea</key><type>POST</type><value></value></HTTPParameters><HTTPParameters sequence="21"><key>aplrecdate2</key><type>POST</type><value></value></HTTPParameters><HTTPParameters 
sequence="14"><key>obligstatus</key><type>POST</type><value></value></HTTPParameters><HTTPParameters sequence="11"><key>appntype</key><type>POST</type><value></value></HTTPParameters><HTTPParameters sequence="12"><key>casetype</key><type>POST</type><value></value></HTTPParameters><HTTPParameters sequence="15"><key>regdate1</key><type>POST</type><value>18/04/2011</value></HTTPParameters><HTTPParameters sequence="3"><key>casefullref</key><type>POST</type><value></value></HTTPParameters><HTTPParameters sequence="5"><key>appname</key><type>POST</type><value></value></HTTPParameters><HTTPParameters sequence="19"><key>dcndate2</key><type>POST</type><value></value></HTTPParameters><HTTPParameters sequence="6"><key>agtname</key><type>POST</type><value></value></HTTPParameters><HTTPParameters sequence="9"><key>WkListDate</key><type>POST</type><value></value></HTTPParameters><script-instances><owner-type>ScrapeableFile</owner-type><owner-name>Search results</owner-name></script-instances></scrapeable-files></scraping-session>