 |
Forum Newbie |
Joined: Tue Jun 28, 2011 10:51 am Posts: 5
|
Hello everyone, I'll start by saying I'm not sure if this is the most appropriate place to put this question, so moderators, please move it as you see fit. I have written an XML parser for a fairly small file (115 KB). I initially wrote it using an OO approach, and was concerned by how long the page took to load (~4-5 seconds). I thought maybe the post-processing (I read it into an array, then process through it, adding output, formatting, etc.) was slowing it down, so I cut that all out, and it still took quite a long time. I decided to whip up a second version that removed the OO style and switched to a functional (procedural) approach, and it cut the time down to ~1 second! A huge improvement that I was not expecting. I did clear the cache and tested the loading time to make sure I was getting a reliable answer. Does this make sense to others? I would like to stick with a more OO approach as it is easier to structure and expand should I need to, but not if it is going to cost me this extra time. Is accessing object properties that much slower than simply accessing a variable, or am I doing something in the OO code that is causing extra instances of the class to be created? The OO approach:
/**
 * Event-driven reader for the publications XML file.
 *
 * The class registers its own startElement(), endElement() and contents()
 * methods as handlers on a PHP XML parser. Each <publication> element is
 * turned into an Article, BookContrib, Patent or PatentPub object; the
 * finished objects are available through getPubs().
 */
class PubsParser
{
    const NO_PUB_TYPE = 'No publication type declared';

    /** Number of bytes handed to xml_parse() per read. */
    const CHUNK_SIZE = 8192;

    /** When true, parseFile() dumps the parsed structure instead of building objects. */
    private $xmlDebug;
    /** When true, debugPrint() echoes its argument. */
    private $xmlDebugOutput;
    /** The XML parser created by parseFile(). */
    private $p;
    /** Set to true once the first <publication> element has been opened. */
    private $inEntry;
    /** Stack of objects currently being filled (publication, then its book). */
    private $publication;
    /** Character data collected for the element currently being read. */
    private $data;
    /** Name parts (first/middle/last/suffix) of the author being read. */
    private $curAuthor;
    /** True while inside a <book> element. */
    private $inBook;
    private $pubLevel;
    /** Object that receives the setter calls made by endElement(). */
    private $curPub;
    /** Number of <publication> elements opened so far. */
    private $curPubCount;
    /** Completed publication objects, in the order their elements closed. */
    private $publications;

    /**
     * @param bool $xmlDebug       dump the parsed structure instead of building objects
     * @param bool $xmlDebugOutput echo the trace messages passed to debugPrint()
     */
    public function __construct($xmlDebug=false, $xmlDebugOutput=false)
    {
        $this->inEntry = false;
        $this->inBook = false;
        $this->data = "";
        $this->pubLevel = 0;
        $this->curPubCount = 0;
        $this->curAuthor = array();
        // Initialised explicitly so array_pop() never receives null.
        $this->publication = array();
        $this->publications = array();
        $this->xmlDebug = $xmlDebug;
        $this->xmlDebugOutput = $xmlDebugOutput;
    }

    /**
     * Echoes $s when debug output was enabled in the constructor.
     *
     * @param string $s message to print
     */
    public function debugPrint($s)
    {
        if ($this->xmlDebugOutput)
        {
            echo $s;
        }
    }

    /**
     * Parses $file and collects its publications.
     *
     * In debug mode the file is parsed into a flat structure which is
     * printed with print_r(); no publication objects are built. Otherwise
     * the file is streamed through the handler methods of this class.
     * Terminates the script with die() when the file cannot be opened or
     * read, or when the parser reports an error.
     *
     * @param string $file path of the XML file
     */
    public function parseFile($file="pubs.xml")
    {
        $this->p = xml_parser_create();
        xml_parser_set_option($this->p, XML_OPTION_SKIP_WHITE, 1);
        xml_parser_set_option($this->p, XML_OPTION_CASE_FOLDING, 0);

        if ($this->xmlDebug)
        {
            xml_parse_into_struct($this->p, file_get_contents($file), $val, $inx);
            print_r($val);
            print_r($inx);
            return;
        }

        // Array callables bind the handlers to this instance without
        // needing xml_set_object().
        xml_set_element_handler(
            $this->p,
            array($this, 'startElement'),
            array($this, 'endElement')
        );
        xml_set_character_data_handler($this->p, array($this, 'contents'));

        $fp = fopen($file, "r") or die("Could not open file");

        // feof() only becomes true after a read has hit the end of the
        // file, so the loop is driven by feof() itself: the last call to
        // xml_parse() is then guaranteed to carry the "final" flag and
        // the parser gets the chance to report a truncated document.
        while (!feof($fp))
        {
            $chunk = fread($fp, self::CHUNK_SIZE);
            if ($chunk === false)
            {
                fclose($fp);
                die("Could not read file");
            }
            if (!xml_parse($this->p, $chunk, feof($fp)))
            {
                die(sprintf("XML error: %s at line %d",
                    xml_error_string(xml_get_error_code($this->p)),
                    xml_get_current_line_number($this->p)));
            }
        }
        fclose($fp);
    }

    /**
     * @return array the publication objects completed so far
     */
    public function getPubs()
    {
        return $this->publications;
    }

    /**
     * Start-tag handler.
     *
     * A <publication> element creates the object matching its "type"
     * attribute and pushes it on the stack; <book> attaches a new Book to
     * the current publication and makes that book the current target.
     *
     * @param mixed  $p       the XML parser invoking the handler
     * @param string $element element name
     * @param array  $attrib  attributes of the element
     */
    public function startElement($p, $element, $attrib)
    {
        // Drop whitespace collected between the previous end tag and this
        // start tag so that it cannot leak into this element's value.
        $this->data = "";
        $this->debugPrint("starting element: $element with data: *$this->data*".PHP_EOL);

        switch ($element)
        {
            case 'publication':
                $this->inEntry = true;
                if (!array_key_exists('type', $attrib))
                {
                    die(self::NO_PUB_TYPE." on line: ".xml_get_current_line_number($p));
                }
                $type = $attrib['type'];
                $this->curPubCount++;
                switch ($type)
                {
                    case 'article':
                        $this->curPub = new Article();
                        break;
                    case 'bookcontrib':
                        $this->curPub = new BookContrib();
                        break;
                    case 'patent':
                        $this->curPub = new Patent();
                        break;
                    case 'patentpub':
                        $this->curPub = new PatentPub();
                        break;
                    default:
                        // Without this branch the previous publication
                        // object would be reused and overwritten.
                        die("Unknown publication type '$type' on line: ".xml_get_current_line_number($p));
                }
                $this->publication[] = $this->curPub;
                break;

            case 'publist':
                break;

            default:
                if ($this->inEntry)
                {
                    switch ($element)
                    {
                        case 'author':
                            $this->curAuthor = array();
                            break;
                        case 'book':
                            $this->inBook = true;
                            $this->curPub->setBook(new Book());
                            $this->publication[] = $this->curPub->getBook();
                            $this->curPub = $this->curPub->getBook();
                            break;
                    }
                }
                else
                {
                    echo $element;
                    echo 'not in entry';
                }
                break;
        }
    }

    /**
     * End-tag handler: stores the trimmed character data of the element
     * on the current target object and clears the buffer.
     *
     * @param mixed  $p       the XML parser invoking the handler
     * @param string $element element name
     */
    public function endElement($p, $element)
    {
        $this->debugPrint("ending element: $element with data of: $this->data".PHP_EOL);

        if ($element == 'publication')
        {
            $this->curPub = array_pop($this->publication);
            $this->publications[] = $this->curPub;
        }

        $curPub = $this->curPub;
        $data = trim($this->data);

        switch ($element)
        {
            case 'title':
                $curPub->setTitle($data);
                break;
            case 'author':
                $curPub->addAuthor($this->curAuthor);
                break;
            case 'first':
            case 'middle':
            case 'last':
            case 'suffix':
                $this->curAuthor[$element] = $data;
                break;
            case 'journal':
                $curPub->setJournal($data);
                break;
            case 'year':
                $curPub->setYear($data);
                break;
            case 'volume':
                $curPub->setVolume($data);
                break;
            case 'spage':
            case 'epage':
                $curPub->addPage($data);
                break;
            case 'note':
                $curPub->setNote($data);
                break;
            case 'book':
                // Leave the book: the enclosing publication becomes the
                // target again. This has to be written to the property;
                // assigning only the local copy would keep routing the
                // following elements to the book.
                array_pop($this->publication);
                $depth = count($this->publication);
                $this->curPub = $depth > 0 ? $this->publication[$depth - 1] : null;
                $this->inBook = false;
                break;
            case 'publisher':
                $curPub->setPublisher($data);
                break;
            case 'location':
                $curPub->setLocation($data);
                break;
            case 'series':
                $curPub->setSeries($data);
                break;
            case 'country':
                $curPub->setCountry($data);
                break;
            case 'patentnum':
                $curPub->setPatentNum($data);
                break;
            case 'date':
                $curPub->setDate($data);
                break;
            case 'pagecount':
                $curPub->setPageCount($data);
                break;
            case 'link':
                $curPub->setLink($data);
                break;
            case 'patenttype':
                $curPub->setPatentType($data);
                break;
            case 'volsupplement':
                $curPub->setVolSupplement($data);
                break;
        }

        $this->data = "";
    }

    /**
     * Character-data handler: appends $content to the buffer. The parser
     * may deliver the text of one element in several pieces.
     *
     * @param mixed  $p       the XML parser invoking the handler
     * @param string $content piece of character data
     */
    public function contents($p, $content)
    {
        $this->debugPrint("writing contents: *$content*; to data: *$this->data*".PHP_EOL);
        $this->data .= $content;
        $this->debugPrint("data now contains: *$this->data*".PHP_EOL);
    }
}
The functional approach
// Parser state shared by parseFile() and the handler functions through
// `global`. Every variable is given an explicit initial value: a bare
// `$name;` statement does not create or initialise anything.
$xmlDebugOutput = false;   // when true, debugPrint() echoes its argument
$xmlDebug = false;         // when true, parseFile() dumps the parsed structure
$data = "";                // character data of the element being read
$inEntry = false;          // set once the first <publication> has been opened
$curPubCount = 0;          // number of <publication> elements opened
$curPub = null;            // object that receives the setter calls
$publication = array();    // stack of objects currently being filled
$curAuthor = array();      // name parts of the author being read
$inBook = false;           // set when a <book> element is opened
$publications = array();   // completed publication objects
define('NO_PUB_TYPE', 'No publication type declared');
/**
 * Parses the publications file with the global handler functions.
 *
 * When the global $xmlDebug is set, the file is parsed into a flat
 * structure that is printed with print_r(). Otherwise the file is streamed
 * through startElement(), endElement() and contents(). Terminates the
 * script with die() when the file cannot be opened or read, or when the
 * parser reports an error.
 *
 * @param string $file path of the XML file; defaults to the previously
 *                     hard-coded "pubs.xml"
 */
function parseFile($file = "pubs.xml")
{
    global $xmlDebug;

    $p = xml_parser_create();
    xml_parser_set_option($p, XML_OPTION_SKIP_WHITE, 1);
    xml_parser_set_option($p, XML_OPTION_CASE_FOLDING, 0);

    if ($xmlDebug)
    {
        xml_parse_into_struct($p, file_get_contents($file), $val, $inx);
        print_r($val);
        print_r($inx);
        return;
    }

    xml_set_element_handler($p, 'startElement', 'endElement');
    xml_set_character_data_handler($p, 'contents');

    $fp = fopen($file, "r") or die("Could not open file");
    $chunkSize = 8192;

    // feof() only becomes true after a read has hit the end of the file,
    // so the loop is driven by feof() itself: the last xml_parse() call is
    // then guaranteed to carry the "final" flag and the parser gets the
    // chance to report a truncated document.
    while (!feof($fp))
    {
        $chunk = fread($fp, $chunkSize);
        if ($chunk === false)
        {
            fclose($fp);
            die("Could not read file");
        }
        if (!xml_parse($p, $chunk, feof($fp)))
        {
            die(sprintf("XML error: %s at line %d",
                xml_error_string(xml_get_error_code($p)),
                xml_get_current_line_number($p)));
        }
    }
    fclose($fp);
}
/**
 * Start-tag handler.
 *
 * A <publication> element creates the object matching its "type" attribute
 * and pushes it on the $publication stack; <book> attaches a new Book to
 * the current publication and makes that book the current target. The
 * character buffer is cleared at every start tag.
 *
 * @param mixed  $p       the XML parser invoking the handler
 * @param string $element element name
 * @param array  $attrib  attributes of the element
 */
function startElement($p, $element, $attrib)
{
    global $data, $inEntry, $curPubCount, $curPub, $publication, $curAuthor, $inBook;

    $data = "";
    debugPrint("starting element: $element with data: *$data*".PHP_EOL);

    switch ($element)
    {
        case 'publication':
            $inEntry = true;
            if (!array_key_exists('type', $attrib))
            {
                die(NO_PUB_TYPE." on line: ".xml_get_current_line_number($p));
            }
            $type = $attrib['type'];
            $curPubCount++;
            switch ($type)
            {
                case 'article':
                    $curPub = new Article();
                    break;
                case 'bookcontrib':
                    $curPub = new BookContrib();
                    break;
                case 'patent':
                    $curPub = new Patent();
                    break;
                case 'patentpub':
                    $curPub = new PatentPub();
                    break;
                default:
                    // Without this branch the previous publication object
                    // would be pushed again and overwritten.
                    die("Unknown publication type '$type' on line: ".xml_get_current_line_number($p));
            }
            $publication[] = $curPub;
            break;

        case 'publist':
            break;

        default:
            if ($inEntry)
            {
                switch ($element)
                {
                    case 'author':
                        $curAuthor = array();
                        break;
                    case 'book':
                        $inBook = true;
                        $curPub->setBook(new Book());
                        $publication[] = $curPub->getBook();
                        $curPub = $curPub->getBook();
                        break;
                }
            }
            else
            {
                echo $element;
                echo 'not in entry';
            }
            break;
    }
}
/**
 * End-tag handler: hands the trimmed character data of the element to the
 * current target object and clears the buffer afterwards.
 *
 * @param mixed  $p       the XML parser invoking the handler
 * @param string $element element name
 */
function endElement($p, $element)
{
    global $curPub, $publications, $data, $curAuthor, $publication;

    debugPrint("ending element: $element with data of: $data".PHP_EOL);

    // A closing publication is taken off the stack and recorded as done.
    if ('publication' == $element)
    {
        $finished = array_pop($publication);
        $curPub = $finished;
        $publications[] = $finished;
    }

    $data = trim($data);

    // Elements whose text is passed straight to one setter of the target.
    $setters = array(
        'title'         => 'setTitle',
        'journal'       => 'setJournal',
        'year'          => 'setYear',
        'volume'        => 'setVolume',
        'spage'         => 'addPage',
        'epage'         => 'addPage',
        'note'          => 'setNote',
        'publisher'     => 'setPublisher',
        'location'      => 'setLocation',
        'series'        => 'setSeries',
        'country'       => 'setCountry',
        'patentnum'     => 'setPatentNum',
        'date'          => 'setDate',
        'pagecount'     => 'setPageCount',
        'link'          => 'setLink',
        'patenttype'    => 'setPatentType',
        'volsupplement' => 'setVolSupplement',
    );
    // Elements that are collected as parts of the current author's name.
    $nameParts = array('first', 'middle', 'last', 'suffix');

    if (isset($setters[$element]))
    {
        $method = $setters[$element];
        $curPub->$method($data);
    }
    elseif (in_array($element, $nameParts, true))
    {
        $curAuthor[$element] = $data;
    }
    elseif ('author' == $element)
    {
        $curPub->addAuthor($curAuthor);
    }
    elseif ('book' == $element)
    {
        // Leave the book; the entry below it on the stack is the target again.
        array_pop($publication);
        $top = count($publication) - 1;
        $curPub = $publication[$top];
    }

    $data = "";
}
/**
 * Character-data handler: appends the received text to the global $data
 * buffer, tracing the buffer before and after through debugPrint().
 *
 * @param mixed  $p       the XML parser invoking the handler
 * @param string $content piece of character data
 */
function contents($p, $content)
{
    global $data;

    $before = "writing contents: *$content*; to data: *$data*".PHP_EOL;
    debugPrint($before);

    $data = $data . $content;

    $after = "data now contains: *$data*".PHP_EOL;
    debugPrint($after);
}
/**
 * Echoes $s when the global $xmlDebugOutput flag is truthy; otherwise
 * does nothing.
 *
 * @param string $s message to print
 */
function debugPrint($s)
{
    global $xmlDebugOutput;

    if (!$xmlDebugOutput)
    {
        return;
    }

    echo $s;
}
Code that produces the output
// Choose the parser implementation: the class-based version by default,
// the procedural version when the "func" query parameter is present.
if (!isset($_GET['func']))
{
    require_once("PubsParser.php");
    $p = new PubsParser();
    $p->parseFile();
    $pubs = $p->getPubs();
}
else
{
    require_once("exPubs.php");
    parseFile();
    $pubs = $publications;
}

// Print each publication that was collected.
foreach ($pubs as $pub)
{
    $pub->printPublication();
}
There is quite a bit more formatting that takes place in that foreach loop, but even in this stripped-down version, the functional code runs in 530 ms and the OO code runs in about 3.5 s.
|
|