IT.COM

Complete XML parser

Spaceship
Watch
Impact
62
This class will completely parse an XML file and put it into an array format with the same structure as the XML document, so you can easily use it. For an example of the array generated, click here to see what it does to my FOAF file.

PHP:
<?php

/**
 * Parses an XML document into a nested array mirroring the document's
 * structure, and serialises such an array back into XML text.
 *
 * Every node of the generated tree is an array with these keys:
 *   'tag'        => element name
 *   'attributes' => array(name => value), or null when there are none
 *   'level'      => nesting depth reported by xml_parse_into_struct()
 *   'value'      => character data of the element, or null
 *   'children'   => list of child nodes, or null when there are none
 */
class XMLparser
{
    public $url;            // The URL where the XML file is to be found.
    public $xml;            // The XML data.
    public $parser;         // The parser object.
    public $data_array;     // The flat element list generated by xml_parse_into_struct().
    public $data_processed; // Every struct entry converted into a node, in document order.
                            // Kept for backward compatibility; the tree builder does not read it.
    public $index_array;    // The index array generated by xml_parse_into_struct().
    public $xml_array;      // The tree generated by parse_element.

    /**
     * Abort the script with the parser's current error message and line number.
     *
     * This method never returns: it terminates execution via die().
     */
    public function print_error()
    {
        die(sprintf("XML Error: %s at line %d",
            xml_error_string(xml_get_error_code($this->parser)),
            xml_get_current_line_number($this->parser)
        ));
    }

    /**
     * Download a file from a remote URL.
     *
     * Uses cURL when available and falls back to file_get_contents() otherwise.
     * On a cURL failure the transfer information is printed and the script exits.
     *
     * @param string $url Location of the file to download.
     * @return string|false The body of the response (false only from the
     *                      file_get_contents() fallback when it fails).
     */
    public function get_file($url)
    {
        if (!function_exists('curl_init')) {
            // Fall back to file_get_contents if cURL isn't available.
            return file_get_contents($url);
        }

        // CURLOPT_FOLLOWLOCATION is required for CURLOPT_MAXREDIRS and
        // CURLOPT_AUTOREFERER to have any effect. It is listed last because
        // curl_setopt_array() stops at the first option it cannot set.
        $options = array(
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_HEADER         => false,
            CURLOPT_ENCODING       => "",
            CURLOPT_USERAGENT      => "xml-spider",
            CURLOPT_AUTOREFERER    => true,
            CURLOPT_CONNECTTIMEOUT => 120,
            CURLOPT_TIMEOUT        => 120,
            CURLOPT_MAXREDIRS      => 10,
            CURLOPT_FOLLOWLOCATION => true,
        );

        $ch = curl_init($url);
        curl_setopt_array($ch, $options);
        $content = curl_exec($ch);
        $err = curl_errno($ch);
        $errmsg = curl_error($ch);
        $header = curl_getinfo($ch);
        curl_close($ch);

        // Treat the transfer as failed whenever cURL says so, even if it
        // supplied no human-readable message.
        if ($content === false || $err !== 0) {
            $header['errno'] = $err;
            $header['errmsg'] = $errmsg;
            print_r($header);
            exit;
        }

        return $content;
    }

    /**
     * Download an XML file and pass it to parse_xml.
     *
     * @param string $url Location of the XML document.
     * @return array|null The tree produced by parse_xml.
     */
    public function parse_url($url)
    {
        $this->url = $url;
        return $this->parse_xml($this->get_file($url));
    }

    /**
     * Parse XML text into the nested array structure described on the class.
     *
     * Whitespace-only character data is skipped and tag names keep their
     * original case. A parse failure terminates the script via print_error().
     *
     * @param string $xml The XML document text.
     * @return array|null Root node of the tree, or null if the parser
     *                    produced no elements.
     */
    public function parse_xml($xml)
    {
        $this->xml = $xml;

        $this->parser = xml_parser_create();
        xml_parser_set_option($this->parser, XML_OPTION_SKIP_WHITE, 1);
        xml_parser_set_option($this->parser, XML_OPTION_CASE_FOLDING, 0);

        if (!xml_parse_into_struct($this->parser, $this->xml, $this->data_array, $this->index_array)) {
            $this->print_error();
        }

        $this->data_processed = array();

        // Defensive: avoid reading offset 0 of an empty element list.
        if (empty($this->data_array)) {
            $this->xml_array = null;
            return null;
        }

        $this->xml_array = $this->parse_element($this->data_array[0]);

        return $this->xml_array;
    }

    /**
     * Convert one entry of $this->data_array, and everything nested inside
     * it, into a tree node.
     *
     * The entry is located in $this->data_array by strict comparison and the
     * subtree is then built in a single forward pass, without modifying
     * $this->data_array. If several entries are identical, the first one is
     * used. An entry that is not present in $this->data_array is returned as
     * a node without children.
     *
     * Text that follows a child element (mixed content) is appended to the
     * parent's 'value', so the original interleaving of text and children is
     * not preserved.
     *
     * @param array $element An entry as produced by xml_parse_into_struct().
     * @return array The node for $element with its children attached.
     */
    public function parse_element($element)
    {
        $position = is_array($this->data_array)
            ? array_search($element, $this->data_array, true)
            : false;

        if ($position === false) {
            $this->data_processed[] = $element;
            $node = $this->make_node($element);
            $node['children'] = null;
            return $node;
        }

        $cursor = $position;
        return $this->build_node($cursor);
    }

    /**
     * Build the node for the struct entry at $cursor and advance $cursor
     * past that entry and all of its descendants (including its close entry).
     */
    private function build_node(&$cursor)
    {
        $entry = $this->data_array[$cursor];
        $cursor++;

        $this->data_processed[] = $entry;
        $node = $this->make_node($entry);

        // Only 'open' entries have content between them and a 'close' entry.
        if (isset($entry['type']) && $entry['type'] === 'open') {
            $total = count($this->data_array);

            while ($cursor < $total) {
                $next = $this->data_array[$cursor];
                $type = (is_array($next) && isset($next['type'])) ? $next['type'] : null;

                if ($type === 'open' || $type === 'complete') {
                    // The recursive call consumes the child's whole subtree.
                    $node['children'][] = $this->build_node($cursor);
                    continue;
                }

                $cursor++;

                if ($type === 'close') {
                    // Children consume their own close entries, so the first
                    // one seen here belongs to this node.
                    break;
                }

                if ($type === 'cdata' && isset($next['value'])) {
                    $node['value'] = (string) $node['value'] . $next['value'];
                }
            }
        }

        if ($node['children'] === array()) {
            $node['children'] = null;
        }

        return $node;
    }

    /**
     * Create a childless tree node from a single struct entry.
     */
    private function make_node($entry)
    {
        return array(
            'tag'        => $entry['tag'],
            'attributes' => isset($entry['attributes']) ? $entry['attributes'] : null,
            'level'      => $entry['level'],
            'value'      => isset($entry['value']) ? $entry['value'] : null,
            'children'   => array(),
        );
    }

    /**
     * Escape the characters that are special in XML text and in
     * double-quoted attribute values.
     *
     * Done with plain string replacement so that no HTML-only named
     * entities are ever emitted and the text's encoding is irrelevant.
     */
    private function escape_xml($text)
    {
        return strtr((string) $text, array(
            '&' => '&amp;',
            '<' => '&lt;',
            '>' => '&gt;',
            '"' => '&quot;',
        ));
    }

    /**
     * Convert a node generated by parse_element into XML text.
     *
     * Each element is indented by two spaces per nesting level below the
     * first and is followed by a newline. Elements with neither a value nor
     * children are written in the self-closing form.
     *
     * @param array $element A node as returned by parse_element.
     * @return string The XML text for the node and its descendants.
     */
    public function to_xml($element)
    {
        $spaces = str_repeat('  ', max(0, $element['level'] - 1));

        $output = "{$spaces}<{$element['tag']}";

        if (isset($element['attributes']) && is_array($element['attributes'])) {
            foreach ($element['attributes'] as $attribute => $value) {
                $output .= " {$attribute}=\"" . $this->escape_xml($value) . "\"";
            }
        }

        $has_children = isset($element['children']) && is_array($element['children']);

        // Compare against the empty string rather than using empty(), so
        // that a value of "0" is not mistaken for "no value".
        $text = isset($element['value']) ? (string) $element['value'] : '';
        $has_value = ($text !== '');

        if (!$has_children && !$has_value) {
            return $output . " />\n";
        }

        $output .= ">";

        if ($has_value) {
            $output .= $this->escape_xml($text);
        }

        if ($has_children) {
            $output .= "\n";
            foreach ($element['children'] as $child) {
                $output .= $this->to_xml($child);
            }
            // Line the closing tag up with its opening tag.
            $output .= $spaces;
        }

        return $output . "</{$element['tag']}>\n";
    }
}
?>
Here are two examples:
PHP:
// Build one parser object; both examples below share it.
$parser = new XMLparser();

// Example One: download a remote XML file and dump the resulting array.
$foaf_tree = $parser->parse_url('http://www.barrucadu.co.uk/foaf.rdf');
print_r($foaf_tree);

// Example Two: download it again, then turn the array back into XML text.
$foaf_again = $parser->parse_url('http://www.barrucadu.co.uk/foaf.rdf');
echo $parser->to_xml($foaf_again);
And here's a sanity check if you want to make sure it's working properly:
PHP:
// Sanity check: re-parsing the XML regenerated from a parsed tree must give
// back the same tree. The remote file is downloaded only once, so the result
// cannot be skewed by the file changing (or a download failing) between two
// separate fetches.
$once = $parser->parse_url('http://www.barrucadu.co.uk/foaf.rdf');
$parsed = array('once' => $once,
				'twice' => $parser->parse_xml($parser->to_xml($once)));
if($parsed['once'] !== $parsed['twice'])
  {
	echo "Problem!";
  }
else
  {
	echo "Fine!";
  }

Now I can finally get started with my FOAF file parser... :p
 
Last edited:
0
•••
The views expressed on this page by users and staff are their own, not those of NamePros.
  • The sidebar remains visible by scrolling at a speed relative to the page’s height.
Back