From ec5c49eaff1c54ff779bf36ab1212a749d05e185 Mon Sep 17 00:00:00 2001
From: Wesley Shields <wxs@atarininja.org>
Date: Mon, 30 Dec 2013 16:45:50 -0500
Subject: [PATCH] Make resource parsing more resilient.

I have a UPX-packed sample whose resource directory is corrupted. These changes
allow the resources to be parsed despite the corruption.

They add an RVA and a size to the resource struct: the address and size of the
resource as declared in the directory. If the address is invalid, a zero-length
buffer is created for the data. If the size is invalid (i.e., it runs off the
end of the .rsrc section), a zero-length buffer is created as well. Otherwise,
the actual data is returned.

This allows consumers of the rsrc to determine whether a resource is corrupt
by comparing the length of the buffer to the size element: if the size is
greater than 0 but the buffer is empty, the resource is invalid.

Also, although it should never happen, to be safe make pepy catch NULL
buffers (in pepy_data_converter) and return an empty bytearray.
---
 dump-prog/dump.cpp       |  2 ++
 parser-library/parse.cpp | 19 +++++++++++++++++--
 parser-library/parse.h   |  2 ++
 python/pepy.cpp          | 30 ++++++++++++++++++++++++++----
 python/test.py           |  2 ++
 5 files changed, 49 insertions(+), 6 deletions(-)

diff --git a/dump-prog/dump.cpp b/dump-prog/dump.cpp
index fe170f0..953ea30 100644
--- a/dump-prog/dump.cpp
+++ b/dump-prog/dump.cpp
@@ -105,6 +105,8 @@ int printRsrc(void     *N,
   else
     cout << "Lang: " << to_string<uint32_t>(r.lang, hex) << endl;
   cout << "Codepage: " << to_string<uint32_t>(r.codepage, hex) << endl;
+  cout << "RVA: " << to_string<uint32_t>(r.RVA, dec) << endl;
+  cout << "Size: " << to_string<uint32_t>(r.size, dec) << endl;
   return 0;
 }
 
diff --git a/parser-library/parse.cpp b/parser-library/parse.cpp
index 51d6d37..2b0dd43 100644
--- a/parser-library/parse.cpp
+++ b/parser-library/parse.cpp
@@ -219,14 +219,29 @@ bool parse_resource_table(bounded_buffer *sectionData, ::uint32_t o, ::uint32_t
       rsrc.name = rde->name;
       rsrc.lang = rde->lang;
       rsrc.codepage = rdat.codepage;
+      rsrc.RVA = rdat.RVA;
+      rsrc.size = rdat.size;
 
       // The start address is (RVA - section virtual address).
       uint32_t start = rdat.RVA - virtaddr;
+      /*
+       * Some binaries (particularly packed) will have invalid addresses here.
+       * If those happen, return a zero length buffer.
+       * If the start is valid, try to get the data and if that fails return
+       * a zero length buffer.
+       */
       if (start > rdat.RVA)
-        return false;
-      rsrc.buf = splitBuffer(sectionData, start, start + rdat.size);
+        rsrc.buf = splitBuffer(sectionData, 0, 0);
+      else {
+        rsrc.buf = splitBuffer(sectionData, start, start + rdat.size);
+        if (!rsrc.buf)
+          rsrc.buf = splitBuffer(sectionData, 0, 0);
+      }
+
+      /* If we can't get even a zero length buffer, something is very wrong. */
       if (!rsrc.buf)
         return false;
+
       rsrcs.push_back(rsrc);
     }
   }
diff --git a/parser-library/parse.h b/parser-library/parse.h
index 8ae7df2..8bc5bf5 100644
--- a/parser-library/parse.h
+++ b/parser-library/parse.h
@@ -49,6 +49,8 @@ struct resource {
   boost::uint32_t name;
   boost::uint32_t lang;
   boost::uint32_t codepage;
+  boost::uint32_t RVA;
+  boost::uint32_t size;
   bounded_buffer  *buf;
 };
 
diff --git a/python/pepy.cpp b/python/pepy.cpp
index 8622c80..9304530 100644
--- a/python/pepy.cpp
+++ b/python/pepy.cpp
@@ -76,6 +76,8 @@ typedef struct {
 	PyObject *name;
 	PyObject *lang;
 	PyObject *codepage;
+	PyObject *RVA;
+	PyObject *size;
 	PyObject *data;
 } pepy_resource;
 
@@ -425,7 +427,7 @@ static PyObject *pepy_resource_new(PyTypeObject *type, PyObject *args, PyObject
 }
 
 static int pepy_resource_init(pepy_resource *self, PyObject *args, PyObject *kwds) {
-	if (!PyArg_ParseTuple(args, "OOOOOOOO:pepy_resource_init", &self->type_str, &self->name_str, &self->lang_str, &self->type, &self->name, &self->lang, &self->codepage, &self->data))
+	if (!PyArg_ParseTuple(args, "OOOOOOOOOO:pepy_resource_init", &self->type_str, &self->name_str, &self->lang_str, &self->type, &self->name, &self->lang, &self->codepage, &self->RVA, &self->size, &self->data))
 		return -1;
 
 	return 0;
@@ -439,6 +441,8 @@ static void pepy_resource_dealloc(pepy_resource *self) {
 	Py_XDECREF(self->name);
 	Py_XDECREF(self->lang);
 	Py_XDECREF(self->codepage);
+	Py_XDECREF(self->RVA);
+	Py_XDECREF(self->size);
 	Py_XDECREF(self->data);
 	self->ob_type->tp_free((PyObject *) self);
 }
@@ -450,6 +454,8 @@ PEPY_OBJECT_GET(resource, type)
 PEPY_OBJECT_GET(resource, name)
 PEPY_OBJECT_GET(resource, lang)
 PEPY_OBJECT_GET(resource, codepage)
+PEPY_OBJECT_GET(resource, RVA)
+PEPY_OBJECT_GET(resource, size)
 PEPY_OBJECT_GET(resource, data)
 
 static PyObject *pepy_resource_type_as_str(PyObject *self, PyObject *args) {
@@ -556,6 +562,8 @@ static PyGetSetDef pepy_resource_getseters[] = {
 	OBJECTGETTER(resource, name, "Name"),
 	OBJECTGETTER(resource, lang, "Language"),
 	OBJECTGETTER(resource, codepage, "Codepage"),
+	OBJECTGETTER(resource, RVA, "RVA"),
+	OBJECTGETTER(resource, size, "Size (specified in RDAT)"),
 	OBJECTGETTER(resource, data, "Resource data"),
 	{ NULL }
 };
@@ -703,11 +711,25 @@ static PyObject *pepy_parsed_get_bytes(PyObject *self, PyObject *args) {
 	return ret;
 }
 
-/* This is used to convert bounded buffers into python byte array objects. */
+/*
+ * This is used to convert bounded buffers into python byte array objects.
+ * In case the buffer is NULL, return an empty bytearray.
+ */
 static PyObject *pepy_data_converter(bounded_buffer *data) {
 	PyObject* ret;
+	const char *str;
+	Py_ssize_t len;
 
-	ret = PyByteArray_FromStringAndSize((const char *) data->buf, data->bufLen);
+	if (!data || !data->buf) {
+		str = "";
+		len = 0;
+	}
+	else {
+		str = (const char *) data->buf;
+		len = data->bufLen;
+	}
+
+	ret = PyByteArray_FromStringAndSize(str, len);
 	if (!ret) {
 		PyErr_SetString(pepy_error, "Unable to convert data to byte array.");
 		return NULL;
@@ -773,7 +795,7 @@ int resource_callback(void *cbd, resource r) {
 	 * The tuple item order is important here. It is passed into the
 	 * section type initialization and parsed there.
 	 */
-	tuple = Py_BuildValue("s#s#s#IIIIO&", r.type_str.c_str(), r.type_str.length(), r.name_str.c_str(), r.name_str.length(), r.lang_str.c_str(), r.lang_str.length(), r.type, r.name, r.lang, r.codepage, pepy_data_converter, r.buf);
+	tuple = Py_BuildValue("s#s#s#IIIIIIO&", r.type_str.c_str(), r.type_str.length(), r.name_str.c_str(), r.name_str.length(), r.lang_str.c_str(), r.lang_str.length(), r.type, r.name, r.lang, r.codepage, r.RVA, r.size, pepy_data_converter, r.buf);
 	if (!tuple)
 		return 1;
 
diff --git a/python/test.py b/python/test.py
index eec9d66..f31665c 100755
--- a/python/test.py
+++ b/python/test.py
@@ -86,3 +86,5 @@ for resource in resources:
     else:
         print "\tLang: %s" % hex(resource.lang)
     print "\tCodepage: %s" % hex(resource.codepage)
+    print "\tRVA: %s" % hex(resource.RVA)
+    print "\tSize: %s" % hex(resource.size)