From 4fcae76c4de937af26af78fe9e33215fda0ff6f0 Mon Sep 17 00:00:00 2001 From: huangsimin Date: Mon, 9 Dec 2019 18:02:36 +0800 Subject: [PATCH] init --- cfuns.go | 48 ++++++++++++ clib.go | 213 +++++++++++++++++++++++++++++++++++++++++++++++++++ clib_test.go | 9 +++ go.mod | 5 ++ go.sum | 2 + 5 files changed, 277 insertions(+) create mode 100644 cfuns.go create mode 100644 clib.go create mode 100644 clib_test.go create mode 100644 go.mod create mode 100644 go.sum diff --git a/cfuns.go b/cfuns.go new file mode 100644 index 0000000..c53b6c9 --- /dev/null +++ b/cfuns.go @@ -0,0 +1,48 @@ +package sax + +/* +#cgo pkg-config: libxml-2.0 +#include <libxml/tree.h> +#include <libxml/parser.h> +extern void goStartDocument(void*); +extern void goEndDocument(void*); +extern void goStartElement(void*, const xmlChar*, const xmlChar**, int); +extern void goStartElementNoAttr(void*, const xmlChar*); +extern void goEndElement(void*, const xmlChar*); +extern void goCharacters(void*, const xmlChar*, int); +extern void goCharactersRaw(void*, const xmlChar*, int); +void startDocumentCgo(void* user_data) { + goStartDocument(user_data); +} +void endDocumentCgo(void* user_data) { + goEndDocument(user_data); +} +void startElementCgo(void* user_data, + const xmlChar* name, + const xmlChar** attrs) { + // The attrs array is terminated with a NULL pointer. To make it usable in + // Go, we find the length and pass it explicitly to the Go callback. 
+ int i = 0; + if (attrs != NULL) { + while (attrs[i] != NULL) { + i++; + } + } + goStartElement(user_data, name, attrs, i); +} +void startElementNoAttrCgo(void* user_data, + const xmlChar* name, + const xmlChar** attrs) { + goStartElementNoAttr(user_data, name); +} +void endElementCgo(void* user_data, const xmlChar* name) { + goEndElement(user_data, name); +} +void charactersCgo(void* user_data, const xmlChar* ch, int len) { + goCharacters(user_data, ch, len); +} +void charactersRawCgo(void* user_data, const xmlChar* ch, int len) { + goCharactersRaw(user_data, ch, len); +} +*/ +import "C" diff --git a/clib.go b/clib.go new file mode 100644 index 0000000..d401953 --- /dev/null +++ b/clib.go @@ -0,0 +1,213 @@ +package sax + +import ( + "fmt" + "strings" + "sync" + "unsafe" +) + + + +/* +#cgo pkg-config: libxml-2.0 +#include <stdlib.h> +#include <libxml/tree.h> +#include <libxml/parser.h> + +extern void startDocumentCgo(void*); +extern void endDocumentCgo(void*); +extern void startElementCgo(void*, const xmlChar*, const xmlChar**); +extern void startElementNoAttrCgo(void*, const xmlChar*, const xmlChar**); +extern void endElementCgo(void*, const xmlChar*); +extern void charactersCgo(void*, const xmlChar*, int); +extern void charactersRawCgo(void*, const xmlChar*, int); +// Since this structure contains pointers, take extra care to zero it out +// before passing it to Go code. +static inline xmlSAXHandler newHandlerStruct() { + xmlSAXHandler h = {0}; + return h; +} +// Wrap a C macro in a function callable from Go. +static inline xmlError* getLastError() { + return xmlGetLastError(); +} +*/ +import "C" + +import "github.com/eliben/gosax/pointer" + +// Used to ensure that xmlInitParser is only called once. +var initOnce sync.Once + +func init() { + initOnce.Do(func() { + C.xmlInitParser() + }) +} + +// SaxCallbacks collects callback functions to invoke on SAX events. Only +// populate callbacks you're interested in - callbacks left as nil will not +// be registered with the C layer and may save processing time. 
+// Some callbacks override others for optimization purposes - check the comments +// for more information. +type SaxCallbacks struct { + // StartDocument is invoked on the "start document" event. + StartDocument StartDocumentFunc + + // EndDocument is invoked on the "end document" event + EndDocument EndDocumentFunc + + // StartElement is invoked whenever the beginning of a new element is found. + // name will be the element name, and attrs a slice of attributes where + // attribute names alternate with values. For example, given the element + // <elem foo="bar" id="100"> the callback will get name="elem" and + // attrs=["foo", "bar", "id", "100"]. + StartElement StartElementFunc + + // StartElementNoAttr will override StartElement, if set. When you don't + // care about the attributes of an element, use this one - it will be faster + // because it doesn't have to do attribute unpacking, which is expensive. + StartElementNoAttr StartElementNoAttrFunc + + // EndElement is invoked at the end of parsing an element (after closing tag + // has been processed), with name being the element name. + EndElement EndElementFunc + + // Characters is invoked on character data inside elements. contents is the + // data, as string. Note that this callback may be invoked multiple times + // within a single tag. + Characters CharactersFunc + + // CharactersRaw will override Characters, if set. It doesn't translate XML + // data into a Go string, but leaves it as an opaque pair of (ch, chlen), + // which you could use UnpackString to convert to a string if needed. This + // could be a useful optimization if you're only occasionally interested in + // the contents of character data. 
+ CharactersRaw CharactersRawFunc +} + +type StartDocumentFunc func() +type EndDocumentFunc func() +type StartElementFunc func(name string, attrs []string) +type StartElementNoAttrFunc func(name string) +type EndElementFunc func(name string) +type CharactersFunc func(contents string) +type CharactersRawFunc func(ch unsafe.Pointer, chlen int) + +// UnpackString unpacks the opaque ch, chlen pair (that some callbacks in +// this package may create) into a Go string. +func UnpackString(ch unsafe.Pointer, chlen int) string { + return C.GoStringN((*C.char)(ch), C.int(chlen)) +} + +// ParseFile parses an XML file with the given name using SAX, with cb as +// the callbacks. The file name is required, rather than a reader, because it +// gets passed directly to the C layer. +func ParseFile(filename string, cb SaxCallbacks) error { + var cfilename *C.char = C.CString(filename) + defer C.free(unsafe.Pointer(cfilename)) + + // newHandlerStruct zeroes out all the pointers; we assign only those that + // are passed as non-nil in SaxCallbacks. 
+ SAXhandler := C.newHandlerStruct() + + if cb.StartDocument != nil { + SAXhandler.startDocument = C.startDocumentSAXFunc(C.startDocumentCgo) + } + + if cb.EndDocument != nil { + SAXhandler.endDocument = C.endDocumentSAXFunc(C.endDocumentCgo) + } + + if cb.StartElement != nil { + SAXhandler.startElement = C.startElementSAXFunc(C.startElementCgo) + } + // StartElementNoAttr overrides StartElement + if cb.StartElementNoAttr != nil { + SAXhandler.startElement = C.startElementSAXFunc(C.startElementNoAttrCgo) + } + + if cb.EndElement != nil { + SAXhandler.endElement = C.endElementSAXFunc(C.endElementCgo) + } + + if cb.Characters != nil { + SAXhandler.characters = C.charactersSAXFunc(C.charactersCgo) + } + // CharactersRaw overrides Characters + if cb.CharactersRaw != nil { + SAXhandler.characters = C.charactersSAXFunc(C.charactersRawCgo) + } + + // Pack the callbacks structure into an opaque unsafe.Pointer which we'll + // pass to C as user_data, and C will pass it back to our Go callbacks. + user_data := pointer.Save(&cb) + defer pointer.Unref(user_data) + + rc := C.xmlSAXUserParseFile(&SAXhandler, user_data, cfilename) + if rc != 0 { + xmlErr := C.getLastError() + msg := strings.TrimSpace(C.GoString(xmlErr.message)) + return fmt.Errorf("line %v: error: %v", xmlErr.line, msg) + } + + return nil +} + +//export goStartDocument +func goStartDocument(user_data unsafe.Pointer) { + gcb := pointer.Restore(user_data).(*SaxCallbacks) + gcb.StartDocument() +} + +//export goEndDocument +func goEndDocument(user_data unsafe.Pointer) { + gcb := pointer.Restore(user_data).(*SaxCallbacks) + gcb.EndDocument() +} + +//export goStartElement +func goStartElement(user_data unsafe.Pointer, name *C.char, attrs **C.char, attrlen C.int) { + // Passing attrs to Go is tricky because it's an array of C strings, + // terminated with a NULL pointer. The C callback startElementCgo calculates + // the length of the array and passes it in as attrlen. 
We still have to + // convert it to a Go slice, by mapping a slice on the underlying storage + // and copying the attributes, one by one. This is all rather expensive, so + // consider using the StartElementNoAttr callback instead, when applicable. + gcb := pointer.Restore(user_data).(*SaxCallbacks) + length := int(attrlen) + var goattrs []string + if length > 0 { + tmpslice := (*[1 << 30]*C.char)(unsafe.Pointer(attrs))[:length:length] + goattrs = make([]string, length) + for i, s := range tmpslice { + goattrs[i] = C.GoString(s) + } + } + gcb.StartElement(C.GoString(name), goattrs) +} + +//export goStartElementNoAttr +func goStartElementNoAttr(user_data unsafe.Pointer, name *C.char) { + gcb := pointer.Restore(user_data).(*SaxCallbacks) + gcb.StartElementNoAttr(C.GoString(name)) +} + +//export goEndElement +func goEndElement(user_data unsafe.Pointer, name *C.char) { + gcb := pointer.Restore(user_data).(*SaxCallbacks) + gcb.EndElement(C.GoString(name)) +} + +//export goCharacters +func goCharacters(user_data unsafe.Pointer, ch *C.char, chlen C.int) { + gcb := pointer.Restore(user_data).(*SaxCallbacks) + gcb.Characters(C.GoStringN(ch, chlen)) +} + +//export goCharactersRaw +func goCharactersRaw(user_data unsafe.Pointer, ch *C.char, chlen C.int) { + gcb := pointer.Restore(user_data).(*SaxCallbacks) + gcb.CharactersRaw(unsafe.Pointer(ch), int(chlen)) +} diff --git a/clib_test.go b/clib_test.go new file mode 100644 index 0000000..cc5b066 --- /dev/null +++ b/clib_test.go @@ -0,0 +1,9 @@ +package sax + +import ( + "testing" +) + +func Test1(t *testing.T) { + +} diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..a3a4256 --- /dev/null +++ b/go.mod @@ -0,0 +1,5 @@ +module sax + +go 1.13 + +require github.com/eliben/gosax v0.1.0 diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..9340d77 --- /dev/null +++ b/go.sum @@ -0,0 +1,2 @@ +github.com/eliben/gosax v0.1.0 h1:nJWm362/rDeiKF2yjzJv/C3PZe7KEfbbYA6f8lbRNOY= +github.com/eliben/gosax v0.1.0/go.mod 
h1:iPWVK8Lib0moJouNU/XW5qwKVfBABjm/i66XDZ+Mq0c=