Prevents arbitrary code execution during python/object/new constructor (#386)

* Prevents arbitrary code execution during python/object/new constructor

In FullLoader, the python/object/new constructor, implemented by
construct_python_object_apply, supports setting the state of a
deserialized instance through the set_python_instance_state method.
After the state is set, some operations are performed on the instance
to complete its initialization; however, an attacker can set the
instance's state in such a way that arbitrary code is executed by the
FullLoader.

This patch tries to block such attacks in FullLoader by preventing
set_python_instance_state from setting arbitrary properties. It
implements a blacklist that includes the `extend` method (called by
construct_python_object_apply) and all special methods (e.g. __set__
and __setitem__).

Users who need special attributes to be set in the state of a
deserialized object can still do so through the UnsafeLoader, which,
however, should not be used on untrusted input. Alternatively, they can
subclass FullLoader and redefine `get_state_keys_blacklist()` to
extend/replace the list of blacklisted keys, and pass the subclassed
loader to yaml.load.

* Make sure python/object/new constructor does not set some properties

* Add test to show how to subclass FullLoader with new blacklist
diff --git a/lib/yaml/constructor.py b/lib/yaml/constructor.py
index 859c949..a54758d 100644
--- a/lib/yaml/constructor.py
+++ b/lib/yaml/constructor.py
@@ -33,6 +33,14 @@
         # If there are more documents available?
         return self.check_node()
 
+    def check_state_key(self, key):
+        """Block special attributes/methods from being set in a newly created
+        object, to prevent user-controlled methods from being called during
+        deserialization"""
+        if self.get_state_keys_blacklist_regexp().match(key):
+            raise ConstructorError(None, None,
+                "blacklisted key '%s' in instance state found" % (key,), None)
+
     def get_data(self):
         # Construct and return the next document.
         if self.check_node():
@@ -471,6 +479,16 @@
         SafeConstructor.construct_undefined)
 
 class FullConstructor(SafeConstructor):
+    # 'extend' is blacklisted because it is used by
+    # construct_python_object_apply to add `listitems` to a newly generated
+    # python instance
+    def get_state_keys_blacklist(self):
+        return ['^extend$', '^__.*__$']
+
+    def get_state_keys_blacklist_regexp(self):
+        if not hasattr(self, 'state_keys_blacklist_regexp'):
+            self.state_keys_blacklist_regexp = re.compile('(' + '|'.join(self.get_state_keys_blacklist()) + ')')
+        return self.state_keys_blacklist_regexp
 
     def construct_python_str(self, node):
         return self.construct_scalar(node).encode('utf-8')
@@ -566,7 +584,7 @@
         else:
             return cls(*args, **kwds)
 
-    def set_python_instance_state(self, instance, state):
+    def set_python_instance_state(self, instance, state, unsafe=False):
         if hasattr(instance, '__setstate__'):
             instance.__setstate__(state)
         else:
@@ -574,10 +592,15 @@
             if isinstance(state, tuple) and len(state) == 2:
                 state, slotstate = state
             if hasattr(instance, '__dict__'):
+                if not unsafe and state:
+                    for key in state.keys():
+                        self.check_state_key(key)
                 instance.__dict__.update(state)
             elif state:
                 slotstate.update(state)
             for key, value in slotstate.items():
+                if not unsafe:
+                    self.check_state_key(key)
                 setattr(object, key, value)
 
     def construct_python_object(self, suffix, node):
@@ -699,6 +722,10 @@
         return super(UnsafeConstructor, self).make_python_instance(
             suffix, node, args, kwds, newobj, unsafe=True)
 
+    def set_python_instance_state(self, instance, state):
+        return super(UnsafeConstructor, self).set_python_instance_state(
+            instance, state, unsafe=True)
+
 UnsafeConstructor.add_multi_constructor(
     u'tag:yaml.org,2002:python/object/apply:',
     UnsafeConstructor.construct_python_object_apply)
diff --git a/lib3/yaml/constructor.py b/lib3/yaml/constructor.py
index fb4f1e9..40974af 100644
--- a/lib3/yaml/constructor.py
+++ b/lib3/yaml/constructor.py
@@ -31,6 +31,14 @@
         # If there are more documents available?
         return self.check_node()
 
+    def check_state_key(self, key):
+        """Block special attributes/methods from being set in a newly created
+        object, to prevent user-controlled methods from being called during
+        deserialization"""
+        if self.get_state_keys_blacklist_regexp().match(key):
+            raise ConstructorError(None, None,
+                "blacklisted key '%s' in instance state found" % (key,), None)
+
     def get_data(self):
         # Construct and return the next document.
         if self.check_node():
@@ -471,6 +479,16 @@
         SafeConstructor.construct_undefined)
 
 class FullConstructor(SafeConstructor):
+    # 'extend' is blacklisted because it is used by
+    # construct_python_object_apply to add `listitems` to a newly generated
+    # python instance
+    def get_state_keys_blacklist(self):
+        return ['^extend$', '^__.*__$']
+
+    def get_state_keys_blacklist_regexp(self):
+        if not hasattr(self, 'state_keys_blacklist_regexp'):
+            self.state_keys_blacklist_regexp = re.compile('(' + '|'.join(self.get_state_keys_blacklist()) + ')')
+        return self.state_keys_blacklist_regexp
 
     def construct_python_str(self, node):
         return self.construct_scalar(node)
@@ -573,7 +591,7 @@
         else:
             return cls(*args, **kwds)
 
-    def set_python_instance_state(self, instance, state):
+    def set_python_instance_state(self, instance, state, unsafe=False):
         if hasattr(instance, '__setstate__'):
             instance.__setstate__(state)
         else:
@@ -581,10 +599,15 @@
             if isinstance(state, tuple) and len(state) == 2:
                 state, slotstate = state
             if hasattr(instance, '__dict__'):
+                if not unsafe and state:
+                    for key in state.keys():
+                        self.check_state_key(key)
                 instance.__dict__.update(state)
             elif state:
                 slotstate.update(state)
             for key, value in slotstate.items():
+                if not unsafe:
+                    self.check_state_key(key)
                 setattr(object, key, value)
 
     def construct_python_object(self, suffix, node):
@@ -710,6 +733,10 @@
         return super(UnsafeConstructor, self).make_python_instance(
             suffix, node, args, kwds, newobj, unsafe=True)
 
+    def set_python_instance_state(self, instance, state):
+        return super(UnsafeConstructor, self).set_python_instance_state(
+            instance, state, unsafe=True)
+
 UnsafeConstructor.add_multi_constructor(
     'tag:yaml.org,2002:python/object/apply:',
     UnsafeConstructor.construct_python_object_apply)
diff --git a/tests/data/myfullloader.subclass_blacklist b/tests/data/myfullloader.subclass_blacklist
new file mode 100644
index 0000000..555a2b3
--- /dev/null
+++ b/tests/data/myfullloader.subclass_blacklist
@@ -0,0 +1,5 @@
+- !!python/object/new:yaml.MappingNode
+  args:
+  state:
+    mymethod: test
+    wrong_method: test2
diff --git a/tests/data/overwrite-state-new-constructor.loader-error b/tests/data/overwrite-state-new-constructor.loader-error
new file mode 100644
index 0000000..8d224f1
--- /dev/null
+++ b/tests/data/overwrite-state-new-constructor.loader-error
@@ -0,0 +1,5 @@
+- !!python/object/new:yaml.MappingNode
+  args:
+  state:
+    extend: test
+    __test__: test
diff --git a/tests/lib/test_constructor.py b/tests/lib/test_constructor.py
index beee7b0..a18d13d 100644
--- a/tests/lib/test_constructor.py
+++ b/tests/lib/test_constructor.py
@@ -17,7 +17,7 @@
     global MyLoader, MyDumper, MyTestClass1, MyTestClass2, MyTestClass3, YAMLObject1, YAMLObject2,  \
             AnObject, AnInstance, AState, ACustomState, InitArgs, InitArgsWithState,    \
             NewArgs, NewArgsWithState, Reduce, ReduceWithState, MyInt, MyList, MyDict,  \
-            FixedOffset, today, execute
+            FixedOffset, today, execute, MyFullLoader
 
     class MyLoader(yaml.Loader):
         pass
@@ -213,6 +213,10 @@
         def dst(self, dt):
             return datetime.timedelta(0)
 
+    class MyFullLoader(yaml.FullLoader):
+        def get_state_keys_blacklist(self):
+            return super(MyFullLoader, self).get_state_keys_blacklist() + ['^mymethod$', '^wrong_.*$']
+
     today = datetime.date.today()
 
 def _load_code(expression):
@@ -267,6 +271,18 @@
 
 test_constructor_types.unittest = ['.data', '.code']
 
+def test_subclass_blacklist_types(data_filename, verbose=False):
+    _make_objects()
+    try:
+        yaml.load(open(data_filename, 'rb').read(), MyFullLoader)
+    except yaml.YAMLError as exc:
+        if verbose:
+            print("%s:" % exc.__class__.__name__, exc)
+    else:
+        raise AssertionError("expected an exception")
+
+test_subclass_blacklist_types.unittest = ['.subclass_blacklist']
+
 if __name__ == '__main__':
     import sys, test_constructor
     sys.modules['test_constructor'] = sys.modules['__main__']
diff --git a/tests/lib3/test_constructor.py b/tests/lib3/test_constructor.py
index 427f53c..fb4509e 100644
--- a/tests/lib3/test_constructor.py
+++ b/tests/lib3/test_constructor.py
@@ -14,7 +14,7 @@
     global MyLoader, MyDumper, MyTestClass1, MyTestClass2, MyTestClass3, YAMLObject1, YAMLObject2,  \
             AnObject, AnInstance, AState, ACustomState, InitArgs, InitArgsWithState,    \
             NewArgs, NewArgsWithState, Reduce, ReduceWithState, MyInt, MyList, MyDict,  \
-            FixedOffset, today, execute
+            FixedOffset, today, execute, MyFullLoader
 
     class MyLoader(yaml.Loader):
         pass
@@ -200,6 +200,10 @@
         def dst(self, dt):
             return datetime.timedelta(0)
 
+    class MyFullLoader(yaml.FullLoader):
+        def get_state_keys_blacklist(self):
+            return super().get_state_keys_blacklist() + ['^mymethod$', '^wrong_.*$']
+
     today = datetime.date.today()
 
 def _load_code(expression):
@@ -252,6 +256,18 @@
 
 test_constructor_types.unittest = ['.data', '.code']
 
+def test_subclass_blacklist_types(data_filename, verbose=False):
+    _make_objects()
+    try:
+        yaml.load(open(data_filename, 'rb').read(), MyFullLoader)
+    except yaml.YAMLError as exc:
+        if verbose:
+            print("%s:" % exc.__class__.__name__, exc)
+    else:
+        raise AssertionError("expected an exception")
+
+test_subclass_blacklist_types.unittest = ['.subclass_blacklist']
+
 if __name__ == '__main__':
     import sys, test_constructor
     sys.modules['test_constructor'] = sys.modules['__main__']