import java.io.*; import java.util.*; import javax.xml.parsers.*; import org.xml.sax.*; import org.xml.sax.helpers.*; public class Tree2GraphParser extends DefaultHandler { public static String inputfile = "uwm.xml"; public static String outputfile = "output.txt"; public static String indexfile = "index.txt"; public class Vertex { int id; String text; long start_pos; // notice the start_pos is the start row of the subtree long end_pos; // the end_pos is the end row of the subtree int parent; Vector children; Vertex(int id, String text) { this.id = id; this.text = text; children = new Vector(); } public String toString() { StringBuffer strBuf=new StringBuffer(); text = text.replaceAll("[\n\r]", " "); text = text.replaceAll("<", "<"); text = text.replaceAll(">", ">"); strBuf.append(id+"\t"+"<"+text+"> "+start_pos+" "+end_pos+" "+parent+" "+children.size()); for(Integer i:children) strBuf.append(" "+i); return strBuf.toString(); } } private int nxtID = 0; private int pos = 0; Stack path = new Stack(); StringBuffer buf = new StringBuffer(); FileWriter writer = null; FileWriter index_writer = null; public void startDocument() { try { writer = new FileWriter(outputfile); index_writer = new FileWriter(indexfile); } catch (IOException ioe) { ioe.printStackTrace(); System.exit(-1); } } private void vertexGene(String qName) { // push the vertex into the path. Vertex v = new Vertex(nxtID, qName); nxtID++; if (path.isEmpty()) { // This is the root node. v.parent = -1;// root's parent is denoted as -1 } else { Vertex parent = path.lastElement(); parent.children.add(v.id); v.parent = parent.id; } path.push(v); } private void outputVertex() { // pop the path and write the file. Vertex v = path.pop(); try { index_writer.write(v + "\n"); index_writer.flush(); } catch (IOException ioe) { ioe.printStackTrace(); System.exit(-1); } } public void startElement(String uri, String localName, String qName, Attributes attributes) { try { vertexGene(qName); path.lastElement().start_pos = pos; writer.write("<" + qName + ">"); pos += qName.length() + 2; for (int i = 0; i < attributes.getLength(); i++) { String attrKey = attributes.getQName(i); String attrVal = attributes.getValue(i); vertexGene(attrKey); path.lastElement().start_pos = pos; writer.write("<" + attrKey + ">"); pos = pos + attrKey.length() + 2; vertexGene(attrVal); path.lastElement().start_pos = pos; writer.write(attrVal); pos = pos + attrVal.length(); path.lastElement().end_pos = pos; outputVertex(); writer.write(""); pos = pos + attrKey.length() + 3; path.lastElement().end_pos = pos; outputVertex(); } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } public void characters(char[] ch, int start, int length) throws SAXException { buf.append(new String(ch, start, length).trim()); } public void endElement(String uri, String localName, String qName) { try { if (!buf.toString().trim().equals("")) { vertexGene(buf.toString()); path.lastElement().start_pos = pos; writer.write(buf.toString()); pos = pos + buf.toString().length(); path.lastElement().end_pos = pos; outputVertex(); } buf = new StringBuffer(); writer.write(""); pos = pos + qName.length() + 3; path.lastElement().end_pos = pos; outputVertex(); } catch (IOException e) { e.printStackTrace(); } } public void endDocument() { // close the file; try { index_writer.close(); writer.close(); } catch (IOException ioe) { ioe.printStackTrace(); System.exit(-1); } } public static void getXML(String input_file_name, int start_pos, int end_pos, String out_file_name){ try{ RandomAccessFile index = new RandomAccessFile(input_file_name, "r"); index.seek(start_pos); byte [] bytes = new byte[end_pos - start_pos]; index.read(bytes); FileWriter output = new FileWriter(out_file_name); String out = new String(bytes); output.write(out); output.close(); }catch(Exception e){ e.printStackTrace(); System.exit(-1); } } public static void main(String args[]) throws ParserConfigurationException, SAXException, IOException { //inputfile = args[0]; //outputfile = args[1]; //indexfile = args[2]; SAXParserFactory factory = SAXParserFactory.newInstance(); SAXParser parser = factory.newSAXParser(); Tree2GraphParser graphParser = new Tree2GraphParser(); long start = System.currentTimeMillis(); parser.parse(new File(inputfile), graphParser); long end = System.currentTimeMillis(); System.out.println("Processing time: " + (end - start) + " ms"); /* getXML("output.txt", 67, 270, "out.xml"); */ } }