
Python for Data Analysis: Learning Notes, Ch07

Data Wrangling: Clean, Transform, Merge, Reshape

from __future__ import division
from numpy.random import randn
import numpy as np
import os
import matplotlib.pyplot as plt
np.random.seed(12345)
plt.rc('figure', figsize=(10, 6))
from pandas import Series, DataFrame
import pandas
import pandas as pd
np.set_printoptions(precision=4, threshold=500)
pd.options.display.max_rows = 100
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline

Merging Datasets

Database-Style DataFrame Merges

df1 = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                 'data1': range(7)})
df2 = DataFrame({'key': ['a', 'b', 'd'],
                 'data2': range(3)})
df1

   data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   a
6      6   b

df2

   data2 key
0      0   a
1      1   b
2      2   d

By default, merge joins on the overlapping column names:

pd.merge(df1, df2)

   data1 key  data2
0      0   b      1
1      1   b      1
2      6   b      1
3      2   a      0
4      4   a      0
5      5   a      0

It is better to specify the key explicitly:

pd.merge(df1, df2, on='key')

   data1 key  data2
0      0   b      1
1      1   b      1
2      6   b      1
3      2   a      0
4      4   a      0
5      5   a      0

df3 = DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                 'data1': range(7)})
df4 = DataFrame({'rkey': ['a', 'b', 'd'],
                 'data2': range(3)})
df3

   data1 lkey
0      0    b
1      1    b
2      2    a
3      3    c
4      4    a
5      5    a
6      6    b

df4

   data2 rkey
0      0    a
1      1    b
2      2    d

If the key columns have different names in the two objects, specify each separately:

pd.merge(df3, df4, left_on='lkey', right_on='rkey')

   data1 lkey  data2 rkey
0      0    b      1    b
1      1    b      1    b
2      6    b      1    b
3      2    a      0    a
4      4    a      0    a
5      5    a      0    a

The default is an inner join (the intersection of the keys); how='outer' takes their union:

pd.merge(df1, df2, how='outer')

   data1 key  data2
0    0.0   b    1.0
1    1.0   b    1.0
2    6.0   b    1.0
3    2.0   a    0.0
4    4.0   a    0.0
5    5.0   a    0.0
6    3.0   c    NaN
7    NaN   d    2.0
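
Besides 'inner' and 'outer', merge also accepts how='left' and how='right'. As a quick sketch using the df1/df2 above, a right join keeps every key from the right table:

pd.merge(df1, df2, how='right')  # 'd' rows appear with NaN data1; the df1-only key 'c' drops out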

df1 = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                 'data1': range(6)})
df2 = DataFrame({'key': ['a', 'b', 'a', 'b', 'd'],
                 'data2': range(5)})
df1

   data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   b

df2

   data2 key
0      0   a
1      1   b
2      2   a
3      3   b
4      4   d

pd.merge(df1, df2, on='key', how='left')

    data1 key  data2
0       0   b    1.0
1       0   b    3.0
2       1   b    1.0
3       1   b    3.0
4       2   a    0.0
5       2   a    2.0
6       3   c    NaN
7       4   a    0.0
8       4   a    2.0
9       5   b    1.0
10      5   b    3.0

pd.merge(df1, df2, how='inner')

   data1 key  data2
0      0   b      1
1      0   b      3
2      1   b      1
3      1   b      3
4      5   b      1
5      5   b      3
6      2   a      0
7      2   a      2
8      4   a      0
9      4   a      2

left = DataFrame({'key1': ['foo', 'foo', 'bar'],
                  'key2': ['one', 'two', 'one'],
                  'lval': [1, 2, 3]})
right = DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
                   'key2': ['one', 'one', 'one', 'two'],
                   'rval': [4, 5, 6, 7]})
left

  key1 key2  lval
0  foo  one     1
1  foo  two     2
2  bar  one     3

right

  key1 key2  rval
0  foo  one     4
1  foo  one     5
2  bar  one     6
3  bar  two     7

pd.merge(left, right, on=['key1', 'key2'], how='outer')

  key1 key2  lval  rval
0  foo  one   1.0   4.0
1  foo  one   1.0   5.0
2  foo  two   2.0   NaN
3  bar  one   3.0   6.0
4  bar  two   NaN   7.0

Overlapping column names: by default, merge appends _x and _y suffixes:

pd.merge(left, right, on='key1')

  key1 key2_x  lval key2_y  rval
0  foo    one     1    one     4
1  foo    one     1    one     5
2  foo    two     2    one     4
3  foo    two     2    one     5
4  bar    one     3    one     6
5  bar    one     3    two     7

pd.merge(left, right, on='key1', suffixes=('_left', '_right'))

  key1 key2_left  lval key2_right  rval
0  foo       one     1        one     4
1  foo       one     1        one     5
2  foo       two     2        one     4
3  foo       two     2        one     5
4  bar       one     3        one     6
5  bar       one     3        two     7

Merging on Index

left1 = DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'],
                   'value': range(6)})
right1 = DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])
left1

  key  value
0   a      0
1   b      1
2   a      2
3   a      3
4   b      4
5   c      5

right1

   group_val
a        3.5
b        7.0

pd.merge(left1, right1, left_on='key', right_index=True)

  key  value  group_val
0   a      0        3.5
2   a      2        3.5
3   a      3        3.5
1   b      1        7.0
4   b      4        7.0

pd.merge(left1, right1, left_on='key', right_index=True, how='outer')

  key  value  group_val
0   a      0        3.5
2   a      2        3.5
3   a      3        3.5
1   b      1        7.0
4   b      4        7.0
5   c      5        NaN

lefth = DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
                   'key2': [2000, 2001, 2002, 2001, 2002],
                   'data': np.arange(5.)})
righth = DataFrame(np.arange(12).reshape((6, 2)),
                   index=[['Nevada', 'Nevada', 'Ohio', 'Ohio', 'Ohio', 'Ohio'],
                          [2001, 2000, 2000, 2000, 2001, 2002]],
                   columns=['event1', 'event2'])
lefth

   data    key1  key2
0   0.0    Ohio  2000
1   1.0    Ohio  2001
2   2.0    Ohio  2002
3   3.0  Nevada  2001
4   4.0  Nevada  2002

righth

             event1  event2
Nevada 2001       0       1
       2000       2       3
Ohio   2000       4       5
       2000       6       7
       2001       8       9
       2002      10      11

pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True)

   data    key1  key2  event1  event2
0   0.0    Ohio  2000       4       5
0   0.0    Ohio  2000       6       7
1   1.0    Ohio  2001       8       9
2   2.0    Ohio  2002      10      11
3   3.0  Nevada  2001       0       1

pd.merge(lefth, righth, left_on=['key1', 'key2'],
         right_index=True, how='outer')

   data    key1    key2  event1  event2
0   0.0    Ohio  2000.0     4.0     5.0
0   0.0    Ohio  2000.0     6.0     7.0
1   1.0    Ohio  2001.0     8.0     9.0
2   2.0    Ohio  2002.0    10.0    11.0
3   3.0  Nevada  2001.0     0.0     1.0
4   4.0  Nevada  2002.0     NaN     NaN
4   NaN  Nevada  2000.0     2.0     3.0

left2 = DataFrame([[1., 2.], [3., 4.], [5., 6.]], index=['a', 'c', 'e'],
                  columns=['Ohio', 'Nevada'])
right2 = DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
                   index=['b', 'c', 'd', 'e'], columns=['Missouri', 'Alabama'])
left2

   Ohio  Nevada
a   1.0     2.0
c   3.0     4.0
e   5.0     6.0

right2

   Missouri  Alabama
b       7.0      8.0
c       9.0     10.0
d      11.0     12.0
e      13.0     14.0

pd.merge(left2, right2, how='outer', left_index=True, right_index=True)

   Ohio  Nevada  Missouri  Alabama
a   1.0     2.0       NaN      NaN
b   NaN     NaN       7.0      8.0
c   3.0     4.0       9.0     10.0
d   NaN     NaN      11.0     12.0
e   5.0     6.0      13.0     14.0

left2.join(right2, how='outer')

   Ohio  Nevada  Missouri  Alabama
a   1.0     2.0       NaN      NaN
b   NaN     NaN       7.0      8.0
c   3.0     4.0       9.0     10.0
d   NaN     NaN      11.0     12.0
e   5.0     6.0      13.0     14.0

left1.join(right1, on='key')

  key  value  group_val
0   a      0        3.5
1   b      1        7.0
2   a      2        3.5
3   a      3        3.5
4   b      4        7.0
5   c      5        NaN

another = DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],
                    index=['a', 'c', 'e', 'f'], columns=['New York', 'Oregon'])
another

   New York  Oregon
a       7.0     8.0
c       9.0    10.0
e      11.0    12.0
f      16.0    17.0

This amounts to merging three tables:

left2
right2
another
left2.join([right2, another])

   Ohio  Nevada
a   1.0     2.0
c   3.0     4.0
e   5.0     6.0

   Missouri  Alabama
b       7.0      8.0
c       9.0     10.0
d      11.0     12.0
e      13.0     14.0

   New York  Oregon
a       7.0     8.0
c       9.0    10.0
e      11.0    12.0
f      16.0    17.0

   Ohio  Nevada  Missouri  Alabama  New York  Oregon
a   1.0     2.0       NaN      NaN       7.0     8.0
c   3.0     4.0       9.0     10.0       9.0    10.0
e   5.0     6.0      13.0     14.0      11.0    12.0

left2.join([right2, another], how='outer')

Concatenating Along an Axis

The operations above were all row-level, database-style joins; concatenation instead glues objects together along an axis:

arr = np.arange(12).reshape((3, 4))
arr
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])
np.concatenate([arr, arr], axis=1)
array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])
s1 = Series([0, 1], index=['a', 'b'])
s2 = Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = Series([5, 6], index=['f', 'g'])
pd.concat([s1, s2, s3])
a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64
pd.concat([s1, s2, s3], axis=1)

     0    1    2
a  0.0  NaN  NaN
b  1.0  NaN  NaN
c  NaN  2.0  NaN
d  NaN  3.0  NaN
e  NaN  4.0  NaN
f  NaN  NaN  5.0
g  NaN  NaN  6.0

s4 = pd.concat([s1 * 5, s3])
s4
a    0
b    5
f    5
g    6
dtype: int64
pd.concat([s1, s4], axis=1)

     0  1
a  0.0  0
b  1.0  5
f  NaN  5
g  NaN  6

pd.concat([s1, s4], axis=1, join='inner')

   0  1
a  0  0
b  1  5

pd.concat([s1, s4], axis=1, join_axes=[['a', 'c', 'b', 'e']])

     0    1
a  0.0  0.0
c  NaN  NaN
b  1.0  5.0
e  NaN  NaN
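
A caveat if you run this on newer pandas: join_axes was removed (around pandas 1.0). A sketch of the equivalent is to reindex the concatenated result:

pd.concat([s1, s4], axis=1).reindex(['a', 'c', 'b', 'e'])  # same fixed row order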

Creating a hierarchical index along the concatenation axis:

s1
s3
result = pd.concat([s1, s1, s3], keys=['one', 'two', 'three'])
a    0
b    1
dtype: int64






f    5
g    6
dtype: int64
result
one    a    0
       b    1
two    a    0
       b    1
three  f    5
       g    6
dtype: int64
# Much more on the unstack function later
result.unstack()

         a    b    f    g
one    0.0  1.0  NaN  NaN
two    0.0  1.0  NaN  NaN
three  NaN  NaN  5.0  6.0

pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three'])

   one  two  three
a  0.0  NaN    NaN
b  1.0  NaN    NaN
c  NaN  2.0    NaN
d  NaN  3.0    NaN
e  NaN  4.0    NaN
f  NaN  NaN    5.0
g  NaN  NaN    6.0

df1 = DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'],
                columns=['one', 'two'])
df2 = DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'],
                columns=['three', 'four'])
df1
df2
pd.concat([df1, df2], keys=['level1', 'level2'])
pd.concat([df1, df2], axis=1, keys=['level1', 'level2'])

   one  two
a    0    1
b    2    3
c    4    5

   three  four
a      5     6
c      7     8

          four  one  three  two
level1 a   NaN  0.0    NaN  1.0
       b   NaN  2.0    NaN  3.0
       c   NaN  4.0    NaN  5.0
level2 a   6.0  NaN    5.0  NaN
       c   8.0  NaN    7.0  NaN

  level1      level2     
     one two  three  four
a      0   1    5.0   6.0
b      2   3    NaN   NaN
c      4   5    7.0   8.0

pd.concat({'level1': df1, 'level2': df2}, axis=0)
pd.concat({'level1': df1, 'level2': df2}, axis=1)

          four  one  three  two
level1 a   NaN  0.0    NaN  1.0
       b   NaN  2.0    NaN  3.0
       c   NaN  4.0    NaN  5.0
level2 a   6.0  NaN    5.0  NaN
       c   8.0  NaN    7.0  NaN

  level1      level2     
     one two  three  four
a      0   1    5.0   6.0
b      2   3    NaN   NaN
c      4   5    7.0   8.0

pd.concat([df1, df2], axis=1, keys=['level1', 'level2'],
          names=['upper', 'lower'])

upper level1      level2     
lower    one two  three  four
a          0   1    5.0   6.0
b          2   3    NaN   NaN
c          4   5    7.0   8.0

df1 = DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])
df1

          a         b         c         d
0 -0.204708  0.478943 -0.519439 -0.555730
1  1.965781  1.393406  0.092908  0.281746
2  0.769023  1.246435  1.007189 -1.296221

df2

          b         d         a
0  0.274992  0.228913  1.352917
1  0.886429 -2.001637 -0.371843

Discarding the irrelevant row index with ignore_index:

pd.concat([df1, df2], ignore_index=False)
pd.concat([df1, df2], ignore_index=True)

          a         b         c         d
0 -0.204708  0.478943 -0.519439 -0.555730
1  1.965781  1.393406  0.092908  0.281746
2  0.769023  1.246435  1.007189 -1.296221
0  1.352917  0.274992       NaN  0.228913
1 -0.371843  0.886429       NaN -2.001637

          a         b         c         d
0 -0.204708  0.478943 -0.519439 -0.555730
1  1.965781  1.393406  0.092908  0.281746
2  0.769023  1.246435  1.007189 -1.296221
3  1.352917  0.274992       NaN  0.228913
4 -0.371843  0.886429       NaN -2.001637

Combining Data with Overlap

a = Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
           index=['f', 'e', 'd', 'c', 'b', 'a'])
b = Series(np.arange(len(a), dtype=np.float64),
           index=['f', 'e', 'd', 'c', 'b', 'a'])
b[-1] = np.nan
a
f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64
b
f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    NaN
dtype: float64
np.where(pd.isnull(a), b, a)
array([ 0. ,  2.5,  2. ,  3.5,  4.5,  nan])

combine_first patches missing values from another object, aligning the data by index:

b[:-2].combine_first(a[2:])
a    NaN
b    4.5
c    3.0
d    2.0
e    1.0
f    0.0
dtype: float64
df1 = DataFrame({'a': [1., np.nan, 5., np.nan],
                 'b': [np.nan, 2., np.nan, 6.],
                 'c': range(2, 18, 4)})
df2 = DataFrame({'a': [5., 4., np.nan, 3., 7.],
                 'b': [np.nan, 3., 4., 6., 8.]})
df1
df2
df1.combine_first(df2)

     a    b   c
0  1.0  NaN   2
1  NaN  2.0   6
2  5.0  NaN  10
3  NaN  6.0  14

     a    b
0  5.0  NaN
1  4.0  3.0
2  NaN  4.0
3  3.0  6.0
4  7.0  8.0

     a    b     c
0  1.0  NaN   2.0
1  4.0  2.0   6.0
2  5.0  4.0  10.0
3  3.0  6.0  14.0
4  7.0  8.0   NaN
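
On identically labeled frames, combine_first amounts to an index-aligned, column-wise np.where. A tiny sketch with hypothetical toy frames:

left = DataFrame({'x': [1., np.nan], 'y': [np.nan, 4.]})
right = DataFrame({'x': [9., 9.], 'y': [9., 9.]})
left.combine_first(right)          # gaps in left filled from right
left.where(left.notnull(), right)  # same result when the labels line up exactly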

Reshaping and Pivoting

Reshaping with Hierarchical Indexing

data = DataFrame(np.arange(6).reshape((2, 3)),
                 index=pd.Index(['Ohio', 'Colorado'], name='state'),
                 columns=pd.Index(['one', 'two', 'three'], name='number'))
data

number    one  two  three
state                    
Ohio        0    1      2
Colorado    3    4      5

stack pivots the columns into rows:

result = data.stack()
result
state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32

unstack pivots rows back into columns, operating on the innermost level by default:

result.unstack()

number    one  two  three
state                    
Ohio        0    1      2
Colorado    3    4      5

result.unstack(0)

state   Ohio  Colorado
number                
one        0         3
two        1         4
three      2         5

result.unstack('state')

state   Ohio  Colorado
number                
one        0         3
two        1         4
three      2         5

s1 = Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
s2 = Series([4, 5, 6], index=['c', 'd', 'e'])
s1
s2
data2 = pd.concat([s1, s2], keys=['one', 'two'])
data2
data2.unstack()

a    0
b    1
c    2
d    3
dtype: int64

c    4
d    5
e    6
dtype: int64

one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: int64

       a    b    c    d    e
one  0.0  1.0  2.0  3.0  NaN
two  NaN  NaN  4.0  5.0  6.0

stack filters out missing values by default, so the two operations are invertible:

data2.unstack().stack()
one  a    0.0
     b    1.0
     c    2.0
     d    3.0
two  c    4.0
     d    5.0
     e    6.0
dtype: float64
data2.unstack().stack(dropna=False)
one  a    0.0
     b    1.0
     c    2.0
     d    3.0
     e    NaN
two  a    NaN
     b    NaN
     c    4.0
     d    5.0
     e    6.0
dtype: float64
result
df = DataFrame({'left': result, 'right': result + 5},
               columns=pd.Index(['left', 'right'], name='side'))
df

state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32

side             left  right
state    number             
Ohio     one        0      5
         two        1      6
         three      2      7
Colorado one        3      8
         two        4      9
         three      5     10

When unstacking a DataFrame, the level being rotated becomes the lowest (innermost) level of the resulting column index:

df.unstack('state')

side    left           right         
state   Ohio Colorado  Ohio Colorado
number                              
one        0        3     5        8
two        1        4     6        9
three      2        5     7       10

stack can likewise name the level to rotate back into the rows:

df.unstack('state').stack('side')

state         Ohio  Colorado
number side                 
one    left      0         3
       right     5         8
two    left      1         4
       right     6         9
three  left      2         5
       right     7        10

df.unstack('state')
df.unstack('state').stack('state')

side    left           right         
state   Ohio Colorado  Ohio Colorado
number                              
one        0        3     5        8
two        1        4     6        9
three      2        5     7       10

side             left  right
number state                
one    Ohio         0      5
       Colorado     3      8
two    Ohio         1      6
       Colorado     4      9
three  Ohio         2      7
       Colorado     5     10

Pivoting 'Long' to 'Wide' Format

data = pd.read_csv('ch07/macrodata.csv')
data[:10]
periods = pd.PeriodIndex(year=data.year, quarter=data.quarter, name='date')
periods[:10]
data = DataFrame(data.to_records(),
                 columns=pd.Index(['realgdp', 'infl', 'unemp'], name='item'),
                 index=periods.to_timestamp('D', 'end'))
data[:10]
data.to_records()[:10]
data.stack()[:10]
ldata = data.stack().reset_index().rename(columns={0: 'value'})
wdata = ldata.pivot('date', 'item', 'value')

     year  quarter   realgdp  realcons  realinv  realgovt  realdpi    cpi     m1  tbilrate  unemp      pop  infl  realint
0  1959.0      1.0  2710.349    1707.4  286.898   470.045   1886.9  28.98  139.7      2.82    5.8  177.146  0.00     0.00
1  1959.0      2.0  2778.801    1733.7  310.859   481.301   1919.7  29.15  141.7      3.08    5.1  177.830  2.34     0.74
2  1959.0      3.0  2775.488    1751.8  289.226   491.260   1916.4  29.35  140.5      3.82    5.3  178.657  2.74     1.09
3  1959.0      4.0  2785.204    1753.7  299.356   484.052   1931.3  29.37  140.0      4.33    5.6  179.386  0.27     4.06
4  1960.0      1.0  2847.699    1770.5  331.722   462.199   1955.5  29.54  139.6      3.50    5.2  180.007  2.31     1.19
5  1960.0      2.0  2834.390    1792.9  298.152   460.400   1966.1  29.55  140.2      2.68    5.2  180.671  0.14     2.55
6  1960.0      3.0  2839.022    1785.8  296.375   474.676   1967.8  29.75  140.9      2.36    5.6  181.528  2.70    -0.34
7  1960.0      4.0  2802.616    1788.2  259.764   476.434   1966.6  29.84  141.1      2.29    6.3  182.287  1.21     1.08
8  1961.0      1.0  2819.264    1787.7  266.405   475.854   1984.5  29.81  142.1      2.37    6.8  182.992 -0.40     2.77
9  1961.0      2.0  2872.005    1814.3  286.246   480.328   2014.4  29.92  142.9      2.29    7.0  183.691  1.47     0.81

PeriodIndex(['1959Q1', '1959Q2', '1959Q3', '1959Q4', '1960Q1', '1960Q2',
             '1960Q3', '1960Q4', '1961Q1', '1961Q2'],
            dtype='int64', name='date', freq='Q-DEC')

item         realgdp  infl  unemp
date                             
1959-03-31  2710.349  0.00    5.8
1959-06-30  2778.801  2.34    5.1
1959-09-30  2775.488  2.74    5.3
1959-12-31  2785.204  0.27    5.6
1960-03-31  2847.699  2.31    5.2
1960-06-30  2834.390  0.14    5.2
1960-09-30  2839.022  2.70    5.6
1960-12-31  2802.616  1.21    6.3
1961-03-31  2819.264 -0.40    6.8
1961-06-30  2872.005  1.47    7.0

rec.array([(datetime.datetime(1959, 3, 31, 0, 0), 2710.349, 0.0, 5.8),
 (datetime.datetime(1959, 6, 30, 0, 0), 2778.801, 2.34, 5.1),
 (datetime.datetime(1959, 9, 30, 0, 0), 2775.488, 2.74, 5.3),
 (datetime.datetime(1959, 12, 31, 0, 0), 2785.204, 0.27, 5.6),
 (datetime.datetime(1960, 3, 31, 0, 0), 2847.699, 2.31, 5.2),
 (datetime.datetime(1960, 6, 30, 0, 0), 2834.39, 0.14, 5.2),
 (datetime.datetime(1960, 9, 30, 0, 0), 2839.022, 2.7, 5.6),
 (datetime.datetime(1960, 12, 31, 0, 0), 2802.616, 1.21, 6.3),
 (datetime.datetime(1961, 3, 31, 0, 0), 2819.264, -0.4, 6.8),
 (datetime.datetime(1961, 6, 30, 0, 0), 2872.005, 1.47, 7.0)], 
          dtype=[('date', 'O'), ('realgdp', '<f8'), ('infl', '<f8'), ('unemp', '<f8')])






date        item   
1959-03-31  realgdp    2710.349
            infl          0.000
            unemp         5.800
1959-06-30  realgdp    2778.801
            infl          2.340
            unemp         5.100
1959-09-30  realgdp    2775.488
            infl          2.740
            unemp         5.300
1959-12-31  realgdp    2785.204
dtype: float64
ldata[:10]

        date     item     value
0 1959-03-31  realgdp  2710.349
1 1959-03-31     infl     0.000
2 1959-03-31    unemp     5.800
3 1959-06-30  realgdp  2778.801
4 1959-06-30     infl     2.340
5 1959-06-30    unemp     5.100
6 1959-09-30  realgdp  2775.488
7 1959-09-30     infl     2.740
8 1959-09-30    unemp     5.300
9 1959-12-31  realgdp  2785.204

pivoted = ldata.pivot('date', 'item', 'value')
pivoted.head()

item        infl   realgdp  unemp
date                             
1959-03-31  0.00  2710.349    5.8
1959-06-30  2.34  2778.801    5.1
1959-09-30  2.74  2775.488    5.3
1959-12-31  0.27  2785.204    5.6
1960-03-31  2.31  2847.699    5.2

ldata['value2'] = np.random.randn(len(ldata))
ldata[:10]

        date     item     value    value2
0 1959-03-31  realgdp  2710.349 -0.204708
1 1959-03-31     infl     0.000  0.478943
2 1959-03-31    unemp     5.800 -0.519439
3 1959-06-30  realgdp  2778.801 -0.555730
4 1959-06-30     infl     2.340  1.965781
5 1959-06-30    unemp     5.100  1.393406
6 1959-09-30  realgdp  2775.488  0.092908
7 1959-09-30     infl     2.740  0.281746
8 1959-09-30    unemp     5.300  0.769023
9 1959-12-31  realgdp  2785.204  1.246435

pivoted = ldata.pivot('date', 'item')
pivoted[:5]

           value                    value2                    
item        infl   realgdp unemp      infl   realgdp     unemp
date                                                          
1959-03-31  0.00  2710.349   5.8  0.478943 -0.204708 -0.519439
1959-06-30  2.34  2778.801   5.1  1.965781 -0.555730  1.393406
1959-09-30  2.74  2775.488   5.3  0.281746  0.092908  0.769023
1959-12-31  0.27  2785.204   5.6  1.007189  1.246435 -1.296221
1960-03-31  2.31  2847.699   5.2  0.228913  0.274992  1.352917

pivoted['value'][:5]

item        infl   realgdp  unemp
date                             
1959-03-31  0.00  2710.349    5.8
1959-06-30  2.34  2778.801    5.1
1959-09-30  2.74  2775.488    5.3
1959-12-31  0.27  2785.204    5.6
1960-03-31  2.31  2847.699    5.2

ldata.set_index(['date', 'item'])[:9]
unstacked = ldata.set_index(['date', 'item']).unstack('item')
unstacked[:7]

                        value    value2
date       item                        
1959-03-31 realgdp   2710.349 -0.204708
           infl         0.000  0.478943
           unemp        5.800 -0.519439
1959-06-30 realgdp   2778.801 -0.555730
           infl         2.340  1.965781
           unemp        5.100  1.393406
1959-09-30 realgdp   2775.488  0.092908
           infl         2.740  0.281746
           unemp        5.300  0.769023

           value                    value2                    
item        infl   realgdp unemp      infl   realgdp     unemp
date                                                          
1959-03-31  0.00  2710.349   5.8  0.478943 -0.204708 -0.519439
1959-06-30  2.34  2778.801   5.1  1.965781 -0.555730  1.393406
1959-09-30  2.74  2775.488   5.3  0.281746  0.092908  0.769023
1959-12-31  0.27  2785.204   5.6  1.007189  1.246435 -1.296221
1960-03-31  2.31  2847.699   5.2  0.228913  0.274992  1.352917
1960-06-30  0.14  2834.390   5.2 -2.001637  0.886429 -0.371843
1960-09-30  2.70  2839.022   5.6 -0.438570  1.669025 -0.539741
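
If macrodata.csv is not at hand, the same long-to-wide round trip can be sketched on toy data (hypothetical values; pivot is called positionally, matching the older pandas API used above):

toy = DataFrame({'date': ['2000-01', '2000-01', '2000-02', '2000-02'],
                 'item': ['gdp', 'infl', 'gdp', 'infl'],
                 'value': [1., 2., 3., 4.]})
toy.pivot('date', 'item', 'value')               # wide: one column per item
toy.set_index(['date', 'item']).unstack('item')  # the equivalent spelled out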

Data Transformation

Removing Duplicates

data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                  'k2': [1, 1, 2, 3, 3, 4, 4]})
data

    k1  k2
0  one   1
1  one   1
2  one   2
3  two   3
4  two   3
5  two   4
6  two   4

data.duplicated()
0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool
data.drop_duplicates()

    k1  k2
0  one   1
2  one   2
3  two   3
5  two   4

data['v1'] = range(7)
data
data.drop_duplicates(['k1'])

    k1  k2  v1
0  one   1   0
1  one   1   1
2  one   2   2
3  two   3   3
4  two   3   4
5  two   4   5
6  two   4   6

    k1  k2  v1
0  one   1   0
3  two   3   3

data.drop_duplicates(['k1', 'k2'], keep='last')
data.drop_duplicates(['k1', 'k2'], keep='first')

    k1  k2  v1
1  one   1   1
2  one   2   2
4  two   3   4
6  two   4   6

    k1  k2  v1
0  one   1   0
2  one   2   2
3  two   3   3
5  two   4   5

Transforming Data Using a Function or Mapping

data = DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',
                           'corned beef', 'Bacon', 'pastrami', 'honey ham',
                           'nova lox'],
                  'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

          food  ounces
0        bacon     4.0
1  pulled pork     3.0
2        bacon    12.0
3     Pastrami     6.0
4  corned beef     7.5
5        Bacon     8.0
6     pastrami     3.0
7    honey ham     5.0
8     nova lox     6.0

meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon'
}
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
data

          food  ounces  animal
0        bacon     4.0     pig
1  pulled pork     3.0     pig
2        bacon    12.0     pig
3     Pastrami     6.0     cow
4  corned beef     7.5     cow
5        Bacon     8.0     pig
6     pastrami     3.0     cow
7    honey ham     5.0     pig
8     nova lox     6.0  salmon

data['food'].map(lambda x: meat_to_animal[x.lower()])
0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

Replacing Values

data = Series([1., -999., 2., -999., -1000., 3.])
data
0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64
data.replace(-999, np.nan)
0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64
data.replace([-999, -1000], np.nan)
0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64
data.replace([-999, -1000], [np.nan, 0])
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64
data.replace({-999: np.nan, -1000: 0})
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

Renaming Axis Indexes

data = DataFrame(np.arange(12).reshape((3, 4)),
                 index=['Ohio', 'Colorado', 'New York'],
                 columns=['one', 'two', 'three', 'four'])
data

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
New York    8    9     10    11

data.index.map(str.upper)
array(['OHIO', 'COLORADO', 'NEW YORK'], dtype=object)
data.index = data.index.map(str.upper)
data

          one  two  three  four
OHIO        0    1      2     3
COLORADO    4    5      6     7
NEW YORK    8    9     10    11

data.rename(index=str.title, columns=str.upper)

          ONE  TWO  THREE  FOUR
Ohio        0    1      2     3
Colorado    4    5      6     7
New York    8    9     10    11

data.rename(index={'OHIO': 'INDIANA'},
            columns={'three': 'peekaboo'})

          one  two  peekaboo  four
INDIANA     0    1         2     3
COLORADO    4    5         6     7
NEW YORK    8    9        10    11

# Always returns a reference to a DataFrame
_ = data.rename(index={'OHIO': 'INDIANA'}, inplace=True)
data

          one  two  three  four
INDIANA     0    1      2     3
COLORADO    4    5      6     7
NEW YORK    8    9     10    11

Discretization and Binning

ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, object): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
cats.codes
array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
cats.categories
Index(['(18, 25]', '(25, 35]', '(35, 60]', '(60, 100]'], dtype='object')
pd.value_counts(cats)
(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64
pd.cut(ages, [18, 26, 36, 61, 100], right=False)
[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, object): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)
[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]
data = np.random.rand(20)
pd.cut(data, 4, precision=2)
[(0.25, 0.49], (0.25, 0.49], (0.73, 0.98], (0.25, 0.49], (0.25, 0.49], ..., (0.25, 0.49], (0.73, 0.98], (0.49, 0.73], (0.49, 0.73], (0.49, 0.73]]
Length: 20
Categories (4, object): [(0.0032, 0.25] < (0.25, 0.49] < (0.49, 0.73] < (0.73, 0.98]]
data = np.random.randn(1000)  # Normally distributed
cats = pd.qcut(data, 4)  # Cut into quartiles
cats
[(0.636, 3.26], [-3.745, -0.648], (0.636, 3.26], (-0.022, 0.636], (-0.648, -0.022], ..., (0.636, 3.26], (-0.022, 0.636], [-3.745, -0.648], (-0.022, 0.636], (-0.022, 0.636]]
Length: 1000
Categories (4, object): [[-3.745, -0.648] < (-0.648, -0.022] < (-0.022, 0.636] < (0.636, 3.26]]
pd.value_counts(cats)
(0.636, 3.26]       250
(-0.022, 0.636]     250
(-0.648, -0.022]    250
[-3.745, -0.648]    250
dtype: int64
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])
[(-0.022, 1.298], [-3.745, -1.274], (-0.022, 1.298], (-0.022, 1.298], (-1.274, -0.022], ..., (-0.022, 1.298], (-0.022, 1.298], [-3.745, -1.274], (-0.022, 1.298], (-0.022, 1.298]]
Length: 1000
Categories (4, object): [[-3.745, -1.274] < (-1.274, -0.022] < (-0.022, 1.298] < (1.298, 3.26]]

Detecting and Filtering Outliers

np.random.seed(12345)
data = DataFrame(np.random.randn(1000, 4))
data.describe()

                 0            1            2            3
count  1000.000000  1000.000000  1000.000000  1000.000000
mean     -0.067684     0.067924     0.025598    -0.002298
std       0.998035     0.992106     1.006835     0.996794
min      -3.428254    -3.548824    -3.184377    -3.745356
25%      -0.774890    -0.591841    -0.641675    -0.644144
50%      -0.116401     0.101143     0.002073    -0.013611
75%       0.616366     0.780282     0.680391     0.654328
max       3.366626     2.653656     3.260383     3.927528

col = data[3]
col[np.abs(col) > 3]
97     3.927528
305   -3.399312
400   -3.745356
Name: 3, dtype: float64
data[(np.abs(data) > 3).any(1)]

            0         1         2         3
5   -0.539741  0.476985  3.248944 -1.021228
97  -0.774363  0.552936  0.106061  3.927528
102 -0.655054 -0.565230  3.176873  0.959533
305 -2.315555  0.457246 -0.025907 -3.399312
324  0.050188  1.951312  3.260383  0.963301
400  0.146326  0.508391 -0.196713 -3.745356
499 -0.293333 -0.242459 -3.056990  1.918403
523 -3.428254 -0.296336 -0.439938 -0.867165
586  0.275144  1.179227 -3.184377  1.369891
808 -0.362528 -3.548824  1.553205 -2.186301
900  3.366626 -2.372214  0.851010  1.332846

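Note that newer pandas deprecates the positional axis argument; the keyword form selects the same rows:

data[(np.abs(data) > 3).any(axis=1)]
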
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()

                 0            1            2            3
count  1000.000000  1000.000000  1000.000000  1000.000000
mean     -0.067623     0.068473     0.025153    -0.002081
std       0.995485     0.990253     1.003977     0.989736
min      -3.000000    -3.000000    -3.000000    -3.000000
25%      -0.774890    -0.591841    -0.641675    -0.644144
50%      -0.116401     0.101143     0.002073    -0.013611
75%       0.616366     0.780282     0.680391     0.654328
max       3.000000     2.653656     3.000000     3.000000

Permutation and Random Sampling

df = DataFrame(np.arange(5 * 4).reshape((5, 4)))
sampler = np.random.permutation(5)
sampler
array([1, 0, 2, 3, 4])
df

    0   1   2   3
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15
4  16  17  18  19

df.take(sampler)

    0   1   2   3
1   4   5   6   7
0   0   1   2   3
2   8   9  10  11
3  12  13  14  15
4  16  17  18  19

df.take(np.random.permutation(len(df))[:3])

    0   1   2   3
1   4   5   6   7
0   0   1   2   3
4  16  17  18  19

bag = np.array([5, 7, -1, 6, 4])
sampler = np.random.randint(0, len(bag), size=10)
sampler
array([3, 0, 4, 1, 1, 2, 3, 0, 1, 2])
draws = bag.take(sampler)
draws
array([ 6,  5,  4,  7,  7, -1,  6,  5,  7, -1])
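
Later pandas versions (0.16 and up) wrap both patterns in a sample method; a sketch of the same two draws:

Series(bag).sample(n=10, replace=True)  # sampling with replacement
df.sample(n=3)                          # random subset without replacement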

Computing Indicator / Dummy Variables

df = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                'data1': range(6)})
df
pd.get_dummies(df['key'])

   data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   b

     a    b    c
0  0.0  1.0  0.0
1  0.0  1.0  0.0
2  1.0  0.0  0.0
3  0.0  0.0  1.0
4  1.0  0.0  0.0
5  0.0  1.0  0.0

dummies = pd.get_dummies(df['key'], prefix='key')
dummies
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy

   key_a  key_b  key_c
0    0.0    1.0    0.0
1    0.0    1.0    0.0
2    1.0    0.0    0.0
3    0.0    0.0    1.0
4    1.0    0.0    0.0
5    0.0    1.0    0.0

   data1  key_a  key_b  key_c
0      0    0.0    1.0    0.0
1      1    0.0    1.0    0.0
2      2    1.0    0.0    0.0
3      3    0.0    0.0    1.0
4      4    1.0    0.0    0.0
5      5    0.0    1.0    0.0

mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('ch02/movielens/movies.dat', sep='::', header=None,
                       names=mnames)
movies[:10]

C:\Users\Ewan\Anaconda3\lib\site-packages\ipykernel\__main__.py:3: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  app.launch_new_instance()

   movie_id                               title                        genres
0         1                    Toy Story (1995)   Animation|Children's|Comedy
1         2                      Jumanji (1995)  Adventure|Children's|Fantasy
2         3             Grumpier Old Men (1995)                Comedy|Romance
3         4            Waiting to Exhale (1995)                  Comedy|Drama
4         5  Father of the Bride Part II (1995)                        Comedy
5         6                         Heat (1995)         Action|Crime|Thriller
6         7                      Sabrina (1995)                Comedy|Romance
7         8                 Tom and Huck (1995)          Adventure|Children's
8         9                 Sudden Death (1995)                        Action
9        10                    GoldenEye (1995)     Action|Adventure|Thriller
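
The ParserWarning above names its own fix: pass engine='python' explicitly whenever the separator is longer than one character:

movies = pd.read_table('ch02/movielens/movies.dat', sep='::', header=None,
                       names=mnames, engine='python')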

genre_iter = (set(x.split('|')) for x in movies.genres)
genres = sorted(set.union(*genre_iter))
genres
['Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']
dummies = DataFrame(np.zeros((len(movies), len(genres))), columns=genres)
dummies[:10].ix[:, :5]

   Action  Adventure  Animation  Children's  Comedy
0     0.0        0.0        0.0         0.0     0.0
1     0.0        0.0        0.0         0.0     0.0
2     0.0        0.0        0.0         0.0     0.0
3     0.0        0.0        0.0         0.0     0.0
4     0.0        0.0        0.0         0.0     0.0
5     0.0        0.0        0.0         0.0     0.0
6     0.0        0.0        0.0         0.0     0.0
7     0.0        0.0        0.0         0.0     0.0
8     0.0        0.0        0.0         0.0     0.0
9     0.0        0.0        0.0         0.0     0.0

for i, gen in enumerate(movies.genres):
    dummies.ix[i, gen.split('|')] = 1
dummies[:10].ix[:, :5]

   Action  Adventure  Animation  Children's  Comedy
0     0.0        0.0        1.0         1.0     1.0
1     0.0        1.0        0.0         1.0     0.0
2     0.0        0.0        0.0         0.0     1.0
3     0.0        0.0        0.0         0.0     1.0
4     0.0        0.0        0.0         0.0     1.0
5     1.0        0.0        0.0         0.0     0.0
6     0.0        0.0        0.0         0.0     1.0
7     0.0        1.0        0.0         1.0     0.0
8     1.0        0.0        0.0         0.0     0.0
9     1.0        1.0        0.0         0.0     0.0

movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic.ix[0]
movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Action                                   0
Genre_Adventure                                0
Genre_Animation                                1
Genre_Children's                               1
Genre_Comedy                                   1
Genre_Crime                                    0
Genre_Documentary                              0
Genre_Drama                                    0
Genre_Fantasy                                  0
Genre_Film-Noir                                0
Genre_Horror                                   0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Romance                                  0
Genre_Sci-Fi                                   0
Genre_Thriller                                 0
Genre_War                                      0
Genre_Western                                  0
Name: 0, dtype: object
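
A much shorter route to the same indicator matrix is the vectorized str.get_dummies, which splits on a separator and one-hot encodes in a single call (a sketch; it yields 0/1 integers rather than floats):

dummies = movies.genres.str.get_dummies('|')
movies_windic = movies.join(dummies.add_prefix('Genre_'))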
np.random.seed(12345)
values = np.random.rand(10)
values
array([ 0.9296,  0.3164,  0.1839,  0.2046,  0.5677,  0.5955,  0.9645,
        0.6532,  0.7489,  0.6536])
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
pd.get_dummies(pd.cut(values, bins))

   (0, 0.2]  (0.2, 0.4]  (0.4, 0.6]  (0.6, 0.8]  (0.8, 1]
0       0.0         0.0         0.0         0.0       1.0
1       0.0         1.0         0.0         0.0       0.0
2       1.0         0.0         0.0         0.0       0.0
3       0.0         1.0         0.0         0.0       0.0
4       0.0         0.0         1.0         0.0       0.0
5       0.0         0.0         1.0         0.0       0.0
6       0.0         0.0         0.0         0.0       1.0
7       0.0         0.0         0.0         1.0       0.0
8       0.0         0.0         0.0         1.0       0.0
9       0.0         0.0         0.0         1.0       0.0

String Manipulation

String Object Methods

val = 'a,b,  guido'
val.split(',')
['a', 'b', '  guido']
pieces = [x.strip() for x in val.split(',')]
pieces
['a', 'b', 'guido']
first, second, third = pieces
first + '::' + second + '::' + third
'a::b::guido'

Surprise :P

'::'.join(pieces)
'a::b::guido'
'guido' in val
True
val.index(',')
1
val.find(':')
-1
val.index(':')
---------------------------------------------------------------------------

ValueError                                Traceback (most recent call last)

<ipython-input-110-280f8b2856ce> in <module>()
----> 1 val.index(':')


ValueError: substring not found
val.count(',')
2
val.replace(',', '::')
'a::b::  guido'
val.replace(',', '')
'ab  guido'

Regular Expressions

import re
text = "foo    bar\t baz  \tqux"
re.split(r'\s+', text)
['foo', 'bar', 'baz', 'qux']
regex = re.compile(r'\s+')
regex.split(text)
['foo', 'bar', 'baz', 'qux']
regex.findall(text)
['    ', '\t ', '  \t']
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
# re.IGNORECASE makes the regex case-insensitive
regex = re.compile(pattern, flags=re.IGNORECASE)
regex.findall(text)
['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

search returns only the first match:

m = regex.search(text)
m
<_sre.SRE_Match object; span=(5, 20), match='dave@google.com'>
text[m.start():m.end()]
'dave@google.com'

match only matches at the beginning of the string:

print(regex.match(text))
None

Substitution:

print(regex.sub('REDACTED', text))
Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED

pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags=re.IGNORECASE)
m = regex.match('wesm@bright.net')
m.groups()
('wesm', 'bright', 'net')
regex.findall(text)
[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))
Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com

regex = re.compile(r"""
    (?P<username>[A-Z0-9._%+-]+)
    @
    (?P<domain>[A-Z0-9.-]+)
    \.
    (?P<suffix>[A-Z]{2,4})""", flags=re.IGNORECASE | re.VERBOSE)
m = regex.match('wesm@bright.net')
m.groupdict()
{'domain': 'bright', 'suffix': 'net', 'username': 'wesm'}

Vectorized String Functions in pandas

data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
        'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = Series(data)
data
Dave     dave@google.com
Rob        rob@gmail.com
Steve    steve@gmail.com
Wes                  NaN
dtype: object
data.isnull()
Dave     False
Rob      False
Steve    False
Wes       True
dtype: bool
data.str.contains('gmail')
Dave     False
Rob       True
Steve     True
Wes        NaN
dtype: object
pattern
'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'
data.str.findall(pattern, flags=re.IGNORECASE)
Dave     [(dave, google, com)]
Rob        [(rob, gmail, com)]
Steve    [(steve, gmail, com)]
Wes                        NaN
dtype: object
matches = data.str.match(pattern, flags=re.IGNORECASE)
matches
C:\Users\Ewan\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: FutureWarning: In future versions of pandas, match will change to always return a bool indexer.
  if __name__ == '__main__':





Dave     (dave, google, com)
Rob        (rob, gmail, com)
Steve    (steve, gmail, com)
Wes                      NaN
dtype: object
matches.str.get(1)
Dave     google
Rob       gmail
Steve     gmail
Wes         NaN
dtype: object
matches.str[0]
Dave      dave
Rob        rob
Steve    steve
Wes        NaN
dtype: object
data.str[:5]
Dave     dave@
Rob      rob@g
Steve    steve
Wes        NaN
dtype: object
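
The capture groups in pattern also work with the vectorized str.extract, which returns a DataFrame with one column per group (a sketch):

data.str.extract(pattern, flags=re.IGNORECASE)  # columns 0, 1, 2 = name, domain, suffix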

Example: USDA Food Database

{
  "id": 21441,
  "description": "KENTUCKY FRIED CHICKEN, Fried Chicken, EXTRA CRISPY, Wing, meat and skin with breading",
  "tags": ["KFC"],
  "manufacturer": "Kentucky Fried Chicken",
  "group": "Fast Foods",
  "portions": [
    {
      "amount": 1,
      "unit": "wing, with skin",
      "grams": 68.0
    },
    ...
  ],
  "nutrients": [
    {
      "value": 20.8,
      "units": "g",
      "description": "Protein",
      "group": "Composition"
    },
    ...
  ]
}
import json
db = json.load(open('ch07/foods-2011-10-03.json'))
len(db)
6636
db[0].keys()
dict_keys(['id', 'tags', 'portions', 'nutrients', 'description', 'group', 'manufacturer'])
db[0]['nutrients'][0]
{'description': 'Protein',
 'group': 'Composition',
 'units': 'g',
 'value': 25.18}
nutrients = DataFrame(db[0]['nutrients'])
nutrients[:7]

                   description        group units    value
0                      Protein  Composition     g    25.18
1            Total lipid (fat)  Composition     g    29.20
2  Carbohydrate, by difference  Composition     g     3.06
3                          Ash        Other     g     3.28
4                       Energy       Energy  kcal   376.00
5                        Water  Composition     g    39.28
6                       Energy       Energy    kJ  1573.00

info_keys = ['description', 'group', 'id', 'manufacturer']
info = DataFrame(db, columns=info_keys)
info[:5]

                          description                   group    id manufacturer
0                     Cheese, caraway  Dairy and Egg Products  1008             
1                     Cheese, cheddar  Dairy and Egg Products  1009             
2                        Cheese, edam  Dairy and Egg Products  1018             
3                        Cheese, feta  Dairy and Egg Products  1019             
4  Cheese, mozzarella, part skim milk  Dairy and Egg Products  1028             

pd.value_counts(info.group)[:10]
Vegetables and Vegetable Products    812
Beef Products                        618
Baked Products                       496
Breakfast Cereals                    403
Legumes and Legume Products          365
Fast Foods                           365
Lamb, Veal, and Game Products        345
Sweets                               341
Pork Products                        328
Fruits and Fruit Juices              328
Name: group, dtype: int64
nutrients = []
for rec in db:
    fnuts = DataFrame(rec['nutrients'])
    fnuts['id'] = rec['id']
    nutrients.append(fnuts)
nutrients = pd.concat(nutrients, ignore_index=True)
nutrients[:10]

                   description        group units    value    id
0                      Protein  Composition     g    25.18  1008
1            Total lipid (fat)  Composition     g    29.20  1008
2  Carbohydrate, by difference  Composition     g     3.06  1008
3                          Ash        Other     g     3.28  1008
4                       Energy       Energy  kcal   376.00  1008
5                        Water  Composition     g    39.28  1008
6                       Energy       Energy    kJ  1573.00  1008
7         Fiber, total dietary  Composition     g     0.00  1008
8                  Calcium, Ca     Elements    mg   673.00  1008
9                     Iron, Fe     Elements    mg     0.64  1008

nutrients.duplicated().sum()
14179
nutrients = nutrients.drop_duplicates()
col_mapping = {'description': 'food',
               'group': 'fgroup'}
info = info.rename(columns=col_mapping, copy=False)
info[:10]

                                               food                  fgroup    id manufacturer
0                                   Cheese, caraway  Dairy and Egg Products  1008             
1                                   Cheese, cheddar  Dairy and Egg Products  1009             
2                                      Cheese, edam  Dairy and Egg Products  1018             
3                                      Cheese, feta  Dairy and Egg Products  1019             
4                Cheese, mozzarella, part skim milk  Dairy and Egg Products  1028             
5  Cheese, mozzarella, part skim milk, low moisture  Dairy and Egg Products  1029             
6                                    Cheese, romano  Dairy and Egg Products  1038             
7                                 Cheese, roquefort  Dairy and Egg Products  1039             
8   Cheese spread, pasteurized process, american...  Dairy and Egg Products  1048             
9                       Cream, fluid, half and half  Dairy and Egg Products  1049             

col_mapping = {'description': 'nutrient',
               'group': 'nutgroup'}
nutrients = nutrients.rename(columns=col_mapping, copy=False)
nutrients[:10]

                      nutrient     nutgroup units    value    id
0                      Protein  Composition     g    25.18  1008
1            Total lipid (fat)  Composition     g    29.20  1008
2  Carbohydrate, by difference  Composition     g     3.06  1008
3                          Ash        Other     g     3.28  1008
4                       Energy       Energy  kcal   376.00  1008
5                        Water  Composition     g    39.28  1008
6                       Energy       Energy    kJ  1573.00  1008
7         Fiber, total dietary  Composition     g     0.00  1008
8                  Calcium, Ca     Elements    mg   673.00  1008
9                     Iron, Fe     Elements    mg     0.64  1008

ndata = pd.merge(nutrients, info, on='id', how='outer')
ndata[:10]

                      nutrient     nutgroup units    value    id             food                  fgroup manufacturer
0                      Protein  Composition     g    25.18  1008  Cheese, caraway  Dairy and Egg Products             
1            Total lipid (fat)  Composition     g    29.20  1008  Cheese, caraway  Dairy and Egg Products             
2  Carbohydrate, by difference  Composition     g     3.06  1008  Cheese, caraway  Dairy and Egg Products             
3                          Ash        Other     g     3.28  1008  Cheese, caraway  Dairy and Egg Products             
4                       Energy       Energy  kcal   376.00  1008  Cheese, caraway  Dairy and Egg Products             
5                        Water  Composition     g    39.28  1008  Cheese, caraway  Dairy and Egg Products             
6                       Energy       Energy    kJ  1573.00  1008  Cheese, caraway  Dairy and Egg Products             
7         Fiber, total dietary  Composition     g     0.00  1008  Cheese, caraway  Dairy and Egg Products             
8                  Calcium, Ca     Elements    mg   673.00  1008  Cheese, caraway  Dairy and Egg Products             
9                     Iron, Fe     Elements    mg     0.64  1008  Cheese, caraway  Dairy and Egg Products             

ndata.ix[30000]
nutrient                                       Glycine
nutgroup                                   Amino Acids
units                                                g
value                                             0.04
id                                                6158
food            Soup, tomato bisque, canned, condensed
fgroup                      Soups, Sauces, and Gravies
manufacturer                                          
Name: 30000, dtype: object
result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)
result[:10]
result['Zinc, Zn'].sort_values().plot(kind='barh')
nutrient          fgroup                           
Adjusted Protein  Sweets                               12.900
                  Vegetables and Vegetable Products     2.180
Alanine           Baby Foods                            0.085
                  Baked Products                        0.248
                  Beef Products                         1.550
                  Beverages                             0.003
                  Breakfast Cereals                     0.311
                  Cereal Grains and Pasta               0.373
                  Dairy and Egg Products                0.271
                  Ethnic Foods                          1.290
Name: value, dtype: float64






<matplotlib.axes._subplots.AxesSubplot at 0x1ba08c9e780>

[Figure: horizontal bar chart of the median 'Zinc, Zn' value in each food group]

by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])
get_maximum = lambda x: x.xs(x.value.idxmax())
get_minimum = lambda x: x.xs(x.value.idxmin())
max_foods = by_nutrient.apply(get_maximum)[['value', 'food']]
# make the food a little smaller
max_foods.food = max_foods.food.str[:50]
max_foods.ix['Amino Acids']['food']
nutrient
Alanine                           Gelatins, dry powder, unsweetened
Arginine                               Seeds, sesame flour, low-fat
Aspartic acid                                   Soy protein isolate
Cystine                Seeds, cottonseed flour, low fat (glandless)
Glutamic acid                                   Soy protein isolate
Glycine                           Gelatins, dry powder, unsweetened
Histidine                Whale, beluga, meat, dried (Alaska Native)
Hydroxyproline    KENTUCKY FRIED CHICKEN, Fried Chicken, ORIGINA...
Isoleucine        Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Leucine           Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Lysine            Seal, bearded (Oogruk), meat, dried (Alaska Na...
Methionine                    Fish, cod, Atlantic, dried and salted
Phenylalanine     Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Proline                           Gelatins, dry powder, unsweetened
Serine            Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Threonine         Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Tryptophan         Sea lion, Steller, meat with fat (Alaska Native)
Tyrosine          Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Valine            Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Name: food, dtype: object
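
get_minimum is defined above but never applied; the symmetric call reports the foods with the least of each nutrient:

min_foods = by_nutrient.apply(get_minimum)[['value', 'food']]
min_foods.food = min_foods.food.str[:50]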