-- Details of this presentation will be posted at http://www.danshockley.com/form_processing.php
-- the example here is a search on PetFinder.com
-- version 1.0
-- Configuration for the scrape performed by the run handler:
--   searchBase / searchTerms: URL and form data handed to curlSimpleDownload.
--   before*/after* properties: literal marker strings handed to getTextBetween
--   and getTextBetweenMultiple to cut the results table, rows, cells and links
--   out of the downloaded page source.
--   blankRecord: template record for one pet result.
-- NOTE(review): several string literals below have unbalanced quotes and look
-- truncated -- presumably the HTML markup they contained was stripped when this
-- file was published. The run handler also references afterEachResultsLine and
-- afterOneField, which are not defined in the visible lines. Restore these from
-- the original script before running -- TODO confirm.
property searchBase : "http://www.petfinder.com/pet.cgi"
property searchTerms : "action=1&pet.Animal=Dog&pet.Breed=malamute&pet.Age=Adult&pet.Size=&pet.Sex=M&location=NJ"
property beforeResultsList : "Organization"
property afterResultsList : "
"
property beforeEachResultLine : "
property blankRecord : {orgLink:"", detailLink:"", organization:"", dominantBreed:"", age:"", sex:"", petName:"", pictureLink:"", features:""}
property beforeOneField : "
property baseURLForLinks : "http://petfinder.com/"
property beforeLink : "property afterLink : "\">" property afterLinkText : "
on run
	-- Downloads the search results page, cuts it down to the results table,
	-- splits that into rows and cells, and builds one record per result row.
	-- Returns a list of records shaped like blankRecord (only organization and
	-- petName are filled in; the link assignments are still commented out).
	
	-- get the source as a string
	set sourceHTML to curlSimpleDownload(searchBase, "", searchTerms)
	set resultsBlock to my getTextBetween(sourceHTML, beforeResultsList, afterResultsList)
	set resultsList to my getTextBetweenMultiple(resultsBlock, beforeEachResultLine, afterEachResultsLine)
	set petRecordList to {}
	repeat with oneRawResult in resultsList
		set rawFields to getTextBetweenMultiple(oneRawResult, beforeOneField, afterOneField)
		-- Fields 1 and 5 are read below; a row with fewer cells (e.g. a header or
		-- malformed row) is skipped instead of aborting the whole run.
		if (count of rawFields) >= 5 then
			-- "copy", not "set": set would share the record with the blankRecord
			-- property, so filling in fields would overwrite the template itself.
			copy blankRecord to onePetRecord
			set relativeOrgLink to my getTextBetween(item 1 of rawFields, beforeLink, afterLink)
			--set orgLink of onePetRecord to baseURLForLinks & relativeOrgLink
			set organization of onePetRecord to getTextBetween(item 1 of rawFields, (relativeOrgLink & afterLink) as string, afterLinkText)
			set relativeDetailLink to my getTextBetween(item 5 of rawFields, beforeLink, afterLink)
			--set detailLink of onePetRecord to baseURLForLinks & relativeDetailLink
			set petName of onePetRecord to getTextBetween(item 5 of rawFields, (relativeDetailLink & afterLink) as string, afterLinkText)
			-- onePetRecord is already a private copy, so it can be appended directly
			set end of petRecordList to onePetRecord
		end if
	end repeat
	petRecordList
end run
on curlSimpleDownload(downloadURL, destExpected, theFormInfo)
	-- version 1.2, Daniel A. Shockley - public domain
	-- 1.2 shell-quotes the URL and form data with "quoted form of" so that
	--     characters such as $, `, \ and " in them are passed to curl literally
	-- downloadURL is STRING
	-- saves to destExpected (Mac path as STRING, FILE SPEC, or ALIAS), if given
	-- if destExpected is "", returns source result directly as string
	-- theFormInfo: optional form data for POST - use "" for no form data
	-- RETURNS: when destExpected is given, the text curl writes for
	--   --write-out "%{http_code}" (e.g. "200"); otherwise the downloaded
	--   source filtered through vis. No file is saved in the latter case.
	-- ERRORS: any failure is re-raised with the prefix "curlSimpleDownload FAILED: "
	--   and the original error number.
	try
		-- basic download to standard output
		set curlCode to "curl " & quoted form of (downloadURL as string)
		if (length of theFormInfo) > 0 then
			set curlCode to curlCode & " -d " & quoted form of (theFormInfo as string)
		end if
		-- now, add on the desired file location, if there is one given
		if destExpected is not "" then
			set unixDestExpected to quoted form of POSIX path of (destExpected as string)
			set curlCode to curlCode & " --output " & unixDestExpected & " --write-out \"%{http_code}\""
		else -- result as string
			set curlCode to curlCode & " | vis" -- pipe into vis to strip nonprintable characters
		end if
		set curlResponse to do shell script curlCode
		return curlResponse
	on error errMsg number errNum
		error "curlSimpleDownload FAILED: " & errMsg number errNum
	end try
end curlSimpleDownload
on simpleReplace(thisText, oldChars, newChars)
	-- version 1.1
	-- Replaces every occurrence of oldChars in thisText with newChars and
	-- returns the resulting string.
	-- newChars is coerced to a STRING before use, since other data types do not
	-- always coerce when used as a delimiter (e.g. the number 9 would otherwise
	-- be inserted as "").
	-- The caller's text item delimiters are saved first and put back before returning.
	set savedDelimiters to AppleScript's text item delimiters
	
	-- split on the old text ...
	set AppleScript's text item delimiters to oldChars
	set pieces to text items of thisText
	
	-- ... and join the pieces back together with the new text
	set AppleScript's text item delimiters to {newChars as string}
	set rejoinedText to pieces as string
	
	set AppleScript's text item delimiters to savedDelimiters
	return rejoinedText
end simpleReplace
on getTextBetween(sourceText, beforeText, afterText)
	-- version 1.2
	-- gets the text between the first occurrences of beforeText and afterText in sourceText
	-- RETURNS: the text found, or "" if either surrounding text is not found
	-- 1.2 restores the caller's text item delimiters on the failure path too
	--     (previously they were reset to {""}), and returns "" when afterText
	--     is missing instead of returning everything after beforeText
	
	-- saved outside the try block so the error branch can always restore it
	set oldDelims to AppleScript's text item delimiters
	try
		set AppleScript's text item delimiters to beforeText
		-- errors (and falls into the handler below) when beforeText is absent
		set prefixRemoved to text item 2 of sourceText
		set AppleScript's text item delimiters to afterText
		if (count of text items of prefixRemoved) < 2 then
			-- afterText never occurs after beforeText: nothing is "between"
			set finalResult to ""
		else
			set finalResult to text item 1 of prefixRemoved
		end if
		set AppleScript's text item delimiters to oldDelims
		return finalResult
	on error
		set AppleScript's text item delimiters to oldDelims
		return "" -- return nothing if the surrounding text is not found
	end try
end getTextBetween
on parseChars(thisText, parseString)
	-- version 1.2
	-- Splits thisText on every occurrence of parseString (coerced to a string)
	-- and returns the pieces as a list.
	-- ERRORS: any failure is re-raised with the prefix "ERROR: parseChars() handler: "
	--   and the original error number.
	-- 1.2 restores the caller's text item delimiters before re-raising, so a
	--     failed call no longer leaves parseString installed as the delimiter
	
	-- saved outside the try block so the error branch can always restore it
	set oldDelims to AppleScript's text item delimiters
	try
		set AppleScript's text item delimiters to {parseString as string}
		set parsedList to every text item of thisText
		set AppleScript's text item delimiters to oldDelims
		return parsedList
	on error errMsg number errNum
		set AppleScript's text item delimiters to oldDelims
		error "ERROR: parseChars() handler: " & errMsg number errNum
	end try
end parseChars
on testPathExists(inputPath)
	-- version 1.4
	-- from Richard Morton, on applescript-users@lists.apple.com
	-- public domain, of course. :-)
	-- Returns true when inputPath can be resolved as an alias, false when it
	-- is "" or the alias lookup raises an error.
	-- gets somewhat slower as nested-depth level goes over 10 nested folders
	
	-- an empty path is never looked up
	if inputPath is equal to "" then return false
	
	try
		-- resolving the alias raises an error when nothing exists at that path
		get alias inputPath as string
		return true
	on error
		return false
	end try
end testPathExists
on getTextBetweenMultiple(sourceText, beforeText, afterText)
	-- version 1.1
	-- Collects, as a list of strings, each stretch of sourceText that follows an
	-- occurrence of beforeText and is closed by afterText.
	-- Returns {} when beforeText does not split sourceText at all; sections in
	-- which afterText does not split anything are left out.
	-- NEEDs parseChars()
	--EXAMPLE USE:
	--set someNames to "name: Bob, birth: 7/23/1978; name: Dan, birth: 3/12/1975; name: Jeff, birth: 4/6/1976"
	--set nameList to my getTextBetweenMultiple(someNames, "name: ", ", birth")
	try
		set chunksByBefore to my parseChars(sourceText, beforeText)
		-- a single chunk means beforeText produced no split
		if (count of chunksByBefore) = 1 then return {}
		
		-- the first chunk precedes the first beforeText, so it is dropped
		set candidateChunks to items 2 thru -1 of chunksByBefore
		set matches to {}
		repeat with chunkRef in candidateChunks
			set pieces to my parseChars(chunkRef as string, afterText)
			-- keep the chunk only when afterText actually split it
			if (count of pieces) is not 1 then set end of matches to (item 1 of pieces) as string
		end repeat
		return matches
	on error errMsg number errNum
		-- will not error if parsing datum not found, will return empty list (see above)
		error "getTextBetweenMultiple FAILED: " & errMsg number errNum
	end try
end getTextBetweenMultiple