| 
14 | 14 | from pandas.util.testing import (assert_frame_equal,  | 
15 | 15 |                                  assert_series_equal,  | 
16 | 16 |                                  slow)  | 
 | 17 | +from pandas.types.dtypes import CategoricalDtype  | 
17 | 18 | from pandas import DataFrame, Index, MultiIndex, Series, Categorical  | 
18 | 19 | import pandas.util.testing as tm  | 
19 | 20 | 
 
  | 
@@ -1372,6 +1373,121 @@ def f():  | 
1372 | 1373 |         self.assertRaises(NotImplementedError, f)  | 
1373 | 1374 | 
 
  | 
1374 | 1375 | 
 
  | 
 | 1376 | +class TestMergeCategorical(tm.TestCase):  | 
 | 1377 | +    _multiprocess_can_split_ = True  | 
 | 1378 | + | 
 | 1379 | +    def setUp(self):  | 
 | 1380 | +        np.random.seed(1234)  | 
 | 1381 | +        self.left = DataFrame(  | 
 | 1382 | +            {'X': np.random.choice(['foo', 'bar'], size=(10,)),  | 
 | 1383 | +             'Y': np.random.choice(['one', 'two', 'three'], size=(10,))})  | 
 | 1384 | + | 
 | 1385 | +        self.right = pd.DataFrame(  | 
 | 1386 | +            {'X': np.random.choice(['foo', 'bar'], size=(10,)),  | 
 | 1387 | +             'Z': np.random.choice(['jjj', 'kkk', 'sss'], size=(10,))})  | 
 | 1388 | + | 
 | 1389 | +    def test_identical(self):  | 
 | 1390 | +        # GH 10409  | 
 | 1391 | +        left = self.left.assign(X=self.left.X.astype('category'))  | 
 | 1392 | + | 
 | 1393 | +        merged = pd.merge(left, left, on='X')  | 
 | 1394 | +        result = merged.dtypes.sort_index()  | 
 | 1395 | +        expected = Series([CategoricalDtype(),  | 
 | 1396 | +                           np.dtype('O'),  | 
 | 1397 | +                           np.dtype('O')],  | 
 | 1398 | +                          index=['X', 'Y_x', 'Y_y'])  | 
 | 1399 | +        assert_series_equal(result, expected)  | 
 | 1400 | + | 
 | 1401 | +    def test_other_columns(self):  | 
 | 1402 | +        # non-merge columns should be preserved if possible  | 
 | 1403 | +        x = self.left.X.astype('category')  | 
 | 1404 | +        left = DataFrame({'X': x, 'Y': x})  | 
 | 1405 | + | 
 | 1406 | +        merged = pd.merge(left, left, on='X')  | 
 | 1407 | +        result = merged.dtypes.sort_index()  | 
 | 1408 | +        expected = Series([CategoricalDtype(),  | 
 | 1409 | +                           CategoricalDtype(),  | 
 | 1410 | +                           CategoricalDtype()],  | 
 | 1411 | +                          index=['X', 'Y_x', 'Y_y'])  | 
 | 1412 | +        assert_series_equal(result, expected)  | 
 | 1413 | + | 
 | 1414 | +        # different categories  | 
 | 1415 | +        x = self.left.X.astype('category')  | 
 | 1416 | +        left = DataFrame(  | 
 | 1417 | +            {'X': x,  | 
 | 1418 | +             'Y': x.cat.set_categories(['bar', 'foo', 'bah'])})  | 
 | 1419 | +        right = self.right.drop_duplicates(['X'])  | 
 | 1420 | +        right = right.assign(  | 
 | 1421 | +            Y=pd.Series(['foo', 'foo']).astype(  | 
 | 1422 | +                'category', categories=['foo', 'bar', 'baz']))  | 
 | 1423 | + | 
 | 1424 | +        merged = pd.merge(left, right, on='X')  | 
 | 1425 | +        result = merged.dtypes.sort_index()  | 
 | 1426 | +        expected = Series([CategoricalDtype(),  | 
 | 1427 | +                           CategoricalDtype(),  | 
 | 1428 | +                           CategoricalDtype(),  | 
 | 1429 | +                           np.dtype('O')],  | 
 | 1430 | +                          index=['X', 'Y_x', 'Y_y', 'Z'])  | 
 | 1431 | +        assert_series_equal(result, expected)  | 
 | 1432 | + | 
 | 1433 | +    def test_categories_same(self):  | 
 | 1434 | +        # GH 10409  | 
 | 1435 | +        left = self.left.assign(X=self.left.X.astype('category'))  | 
 | 1436 | +        right = self.right.assign(X=self.right.X.astype('category'))  | 
 | 1437 | + | 
 | 1438 | +        merged = pd.merge(left, right, on='X')  | 
 | 1439 | +        result = merged.dtypes.sort_index()  | 
 | 1440 | +        expected = Series([CategoricalDtype(),  | 
 | 1441 | +                           np.dtype('O'),  | 
 | 1442 | +                           np.dtype('O')],  | 
 | 1443 | +                          index=['X', 'Y', 'Z'])  | 
 | 1444 | +        assert_series_equal(result, expected)  | 
 | 1445 | + | 
 | 1446 | +    def test_categories_different(self):  | 
 | 1447 | + | 
 | 1448 | +        r = self.right.drop_duplicates(['X'])  | 
 | 1449 | + | 
 | 1450 | +        # from above with original categories  | 
 | 1451 | +        left = self.left.assign(X=self.left.X.astype('category'))  | 
 | 1452 | + | 
 | 1453 | +        right = r.assign(X=r.X.astype('category'))  | 
 | 1454 | +        merged = pd.merge(left, right, on='X')  | 
 | 1455 | + | 
 | 1456 | +        # swap the categories  | 
 | 1457 | +        # but should still work (and return categorical)  | 
 | 1458 | +        left = self.left.assign(X=self.left.X.astype('category'))  | 
 | 1459 | +        right = r.assign(X=r.X.astype('category', categories=['foo', 'bar']))  | 
 | 1460 | +        result = pd.merge(left, right, on='X')  | 
 | 1461 | +        tm.assert_index_equal(result.X.cat.categories,  | 
 | 1462 | +                              pd.Index(['bar', 'foo']))  | 
 | 1463 | + | 
 | 1464 | +        assert_frame_equal(result, merged)  | 
 | 1465 | + | 
 | 1466 | +        result = result.dtypes.sort_index()  | 
 | 1467 | +        expected = Series([CategoricalDtype(),  | 
 | 1468 | +                           np.dtype('O'),  | 
 | 1469 | +                           np.dtype('O')],  | 
 | 1470 | +                          index=['X', 'Y', 'Z'])  | 
 | 1471 | +        assert_series_equal(result, expected)  | 
 | 1472 | + | 
 | 1473 | +        # swap the categories and ordered on one  | 
 | 1474 | +        # but should still work (and return categorical)  | 
 | 1475 | +        right = r.assign(X=r.X.astype('category', categories=['foo', 'bar'],  | 
 | 1476 | +                                      ordered=True))  | 
 | 1477 | +        result = pd.merge(left, right, on='X')  | 
 | 1478 | +        tm.assert_index_equal(result.X.cat.categories,  | 
 | 1479 | +                              pd.Index(['bar', 'foo']))  | 
 | 1480 | + | 
 | 1481 | +        assert_frame_equal(result, merged)  | 
 | 1482 | + | 
 | 1483 | +        result = result.dtypes.sort_index()  | 
 | 1484 | +        expected = Series([CategoricalDtype(),  | 
 | 1485 | +                           np.dtype('O'),  | 
 | 1486 | +                           np.dtype('O')],  | 
 | 1487 | +                          index=['X', 'Y', 'Z'])  | 
 | 1488 | +        assert_series_equal(result, expected)  | 
 | 1489 | + | 
 | 1490 | + | 
1375 | 1491 | if __name__ == '__main__':  | 
1376 | 1492 |     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],  | 
1377 | 1493 |                    exit=False)  | 
0 commit comments