Abracadabra

Python data analysis Learning note Ch06

Data loading, storage, and file formats

1
2
3
4
5
6
7
8
9
10
11
12
13
from __future__ import division
from numpy.random import randn
import numpy as np
import os
import sys
import matplotlib.pyplot as plt
np.random.seed(12345)
plt.rc('figure', figsize=(10, 6))
from pandas import Series, DataFrame
import pandas as pd
np.set_printoptions(precision=4)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
1
%pwd
'C:\\Users\\Ewan\\Downloads\\pydata-book-master'

Reading and Writing Data in Text Format

1
!more ch06\ex1.csv
a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
1
2
df = pd.read_csv('ch06/ex1.csv')
df










































abcdmessage
01234hello
15678world
29101112foo

1
pd.read_table('ch06/ex1.csv', sep=',')










































abcdmessage
01234hello
15678world
29101112foo

1
!more ch06\ex2.csv
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
1
2
pd.read_csv('ch06/ex2.csv', header=None)
pd.read_csv('ch06/ex2.csv', names=['a', 'b', 'c', 'd', 'message'])










































01234
01234hello
15678world
29101112foo











































abcdmessage
01234hello
15678world
29101112foo

1
2
names = ['a', 'b', 'c', 'd', 'message']
pd.read_csv('ch06/ex2.csv', names=names, index_col='message')













































abcd
message
hello1234
world5678
foo9101112

1
2
3
!more ch06\csv_mindex.csv
parsed = pd.read_csv('ch06/csv_mindex.csv', index_col=['key1', 'key2'])
parsed
key1,key2,value1,value2
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16
































































value1value2
key1key2
onea12
b34
c56
d78
twoa910
b1112
c1314
d1516

1
list(open('ch06/ex3.txt'))
['            A         B         C\n',
 'aaa -0.264438 -1.026059 -0.619500\n',
 'bbb  0.927272  0.302904 -0.032399\n',
 'ccc -0.264273 -0.386314 -0.217601\n',
 'ddd -0.871858 -0.348382  1.100491\n']

采用正则表达式作为分隔符

1
2
result = pd.read_table('ch06/ex3.txt', sep='\s+')
result








































ABC
aaa-0.264438-1.026059-0.619500
bbb0.9272720.302904-0.032399
ccc-0.264273-0.386314-0.217601
ddd-0.871858-0.3483821.100491

1
2
!more ch06\ex4.csv
pd.read_csv('ch06/ex4.csv', skiprows=[0, 2, 3])
# hey!
a,b,c,d,message
# just wanted to make things more difficult for you
# who reads CSV files with computers, anyway?
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo










































abcdmessage
01234hello
15678world
29101112foo

1
2
3
4
!more ch06\ex5.csv
result = pd.read_csv('ch06/ex5.csv')
result
pd.isnull(result)
something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo














































somethingabcdmessage
0one123.04NaN
1two56NaN8world
2three91011.012foo















































somethingabcdmessage
0FalseFalseFalseFalseFalseTrue
1FalseFalseFalseTrueFalseFalse
2FalseFalseFalseFalseFalseFalse

1
2
result = pd.read_csv('ch06/ex5.csv', na_values=['NULL'])
result














































somethingabcdmessage
0one123.04NaN
1two56NaN8world
2three91011.012foo

1
2
sentinels = {'message': ['foo', 'NA'], 'something': ['two']}
pd.read_csv('ch06/ex5.csv', na_values=sentinels)














































somethingabcdmessage
0one123.04NaN
1NaN56NaN8world
2three91011.012NaN

逐块读取文本文件

1
2
result = pd.read_csv('ch06/ex6.csv')
result


























































































































































































































































































































































































































































































































onetwothreefourkey
00.467976-0.038649-0.295344-1.824726L
1-0.3588931.4044530.704965-0.200638B
2-0.5018400.659254-0.421691-0.057688G
30.2048861.0741341.388361-0.982404R
40.354628-0.1331160.283763-0.837063Q
51.8174800.7422730.419395-2.251035Q
6-0.7767640.935518-0.332872-1.875641U
7-0.9131351.530624-0.5726570.477252K
80.358480-0.497572-0.3670160.507702S
9-1.740877-1.160417-1.6378302.172201G
100.240564-0.3282491.2521551.0727968
110.7640181.165476-0.6395441.495258R
120.571035-0.3105370.582437-0.2987651
132.3176580.430710-1.3342160.199679P
141.547771-1.119753-2.2776340.329586J
15-1.3106080.401719-1.0009871.156708E
16-0.0884960.6347120.1533240.415335B
17-0.018663-0.247487-1.4465220.750938A
18-0.070127-1.5790970.1208920.671432F
19-0.194678-0.4920392.3596050.319810H
20-0.2486180.868707-0.492226-0.717959W
21-1.091549-0.867110-0.647760-0.832562C
220.641404-0.138822-0.621963-0.284839C
231.2164080.9926870.165162-0.069619V
24-0.5644740.7928320.7470530.571675I
251.759879-0.515666-0.2304811.362317S
260.1262660.3092810.382820-0.239199L
271.334360-0.100152-0.840731-0.6439676
28-0.7376200.278087-0.053235-0.950972J
29-1.148486-0.986292-0.1449630.124362Y
99700.633495-0.1865240.9276270.1431644
99710.308636-0.1128570.762842-1.0729771
9972-1.627051-0.9781510.154745-1.229037Z
99730.3148470.0979890.1996080.955193P
99741.6669070.9920050.496128-0.686391S
99750.0106030.708540-1.2587110.226541K
99760.118693-0.714455-0.501342-0.254764K
99770.302616-2.011527-0.6280850.768827H
9978-0.0985721.769086-0.215027-0.053076A
9979-0.0190581.9649940.738538-0.883776F
9980-0.5953490.001781-1.423355-1.458477M
99811.392170-1.396560-1.425306-0.847535H
9982-0.896029-0.1522871.9244830.3651846
9983-2.274642-0.9018741.5003520.996541N
9984-0.3018981.0199061.1021602.624526I
9985-2.548389-0.5853741.496201-0.718815D
9986-0.0645880.759292-1.568415-0.420933E
9987-0.143365-1.111760-1.8155810.4352742
9988-0.070412-1.0559210.338017-0.440763X
99890.6491480.994273-1.3842270.485120Q
9990-0.3707690.404356-1.051628-1.0508998
9991-0.4099800.155627-0.8189901.277350W
99920.301214-1.1112030.6682580.671922A
99931.8211170.4164450.1738740.505118X
99940.0688041.3227590.8023460.223618H
99952.311896-0.417070-1.409599-0.515821L
9996-0.479893-0.6504190.745152-0.646038E
99970.5233310.7871120.4860661.093156K
9998-0.3625590.598894-1.8432010.887292G
9999-0.096376-1.012999-0.657431-0.5733150

10000 rows × 5 columns


1
pd.read_csv('ch06/ex6.csv', nrows=5)


























































onetwothreefourkey
00.467976-0.038649-0.295344-1.824726L
1-0.3588931.4044530.704965-0.200638B
2-0.5018400.659254-0.421691-0.057688G
30.2048861.0741341.388361-0.982404R
40.354628-0.1331160.283763-0.837063Q

1
2
chunker = pd.read_csv('ch06/ex6.csv', chunksize=1000)
chunker
<pandas.io.parsers.TextFileReader at 0x2035229de80>
1
2
3
4
5
6
7
chunker = pd.read_csv('ch06/ex6.csv', chunksize=1000)
tot = Series([])
for piece in chunker:
tot = tot.add(piece['key'].value_counts(), fill_value=0)
tot = tot.sort_values(ascending=False)
1
tot[:10]
E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
M    338.0
J    337.0
F    335.0
K    334.0
H    330.0
dtype: float64

将数据写出到文本格式

1
2
data = pd.read_csv('ch06/ex5.csv')
data














































somethingabcdmessage
0one123.04NaN
1two56NaN8world
2three91011.012foo

1
2
data.to_csv('ch06/out.csv')
!more ch06\out.csv
,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo
1
data.to_csv(sys.stdout, sep='|')
|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo
1
data.to_csv(sys.stdout, na_rep='NULL')
,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo
1
data.to_csv(sys.stdout, index=False, header=False)
one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo
1
data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])
a,b,c
1,2,3.0
5,6,
9,10,11.0
1
2
3
4
5
6
dates = pd.date_range('1/1/2000', periods=7)
dates
ts = Series(np.arange(7), index=dates)
ts
ts.to_csv('ch06/tseries.csv')
!more ch06\tseries.csv
DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07'],
              dtype='datetime64[ns]', freq='D')






2000-01-01    0
2000-01-02    1
2000-01-03    2
2000-01-04    3
2000-01-05    4
2000-01-06    5
2000-01-07    6
Freq: D, dtype: int32



2000-01-01,0
2000-01-02,1
2000-01-03,2
2000-01-04,3
2000-01-05,4
2000-01-06,5
2000-01-07,6
1
Series.from_csv('ch06/tseries.csv', parse_dates=True)
2000-01-01    0
2000-01-02    1
2000-01-03    2
2000-01-04    3
2000-01-05    4
2000-01-06    5
2000-01-07    6
dtype: int64

手动处理分隔符格式

1
!more ch06\ex7.csv
"a","b","c"
"1","2","3"
"1","2","3","4"
1
2
3
4
import csv
f = open('ch06/ex7.csv')
reader = csv.reader(f)
1
2
for line in reader:
print(line)
['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3', '4']
1
2
3
4
5
6
7
8
9
10
lines = list(csv.reader(open('ch06/ex7.csv')))
header, values = lines[0], lines[1:]
for item in zip(*values):
print(item)
for h, v in zip(header, zip(*values)):
print(h, v)
data_dict = {h: v for h, v in zip(header, zip(*values))}
data_dict
('1', '1')
('2', '2')
('3', '3')
a ('1', '1')
b ('2', '2')
c ('3', '3')





{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}
1
2
3
4
5
class my_dialect(csv.Dialect):
lineterminator = '\n'
delimiter = ';'
quotechar = '"'
quoting = csv.QUOTE_MINIMAL
1
2
3
4
5
6
with open('mydata.csv', 'w') as f:
writer = csv.writer(f, dialect=my_dialect)
writer.writerow(('one', 'two', 'three'))
writer.writerow(('1', '2', '3'))
writer.writerow(('4', '5', '6'))
writer.writerow(('7', '8', '9'))
14






6






6






6
1
!more mydata.csv
one;two;three
1;2;3
4;5;6
7;8;9

JSON数据

1
2
3
4
5
6
7
8
obj = """
{"name": "Wes",
"places_lived": ["United States", "Spain", "Germany"],
"pet": null,
"siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
{"name": "Katie", "age": 33, "pet": "Cisco"}]
}
"""
1
2
3
import json
result = json.loads(obj)
result
{'name': 'Wes',
 'pet': None,
 'places_lived': ['United States', 'Spain', 'Germany'],
 'siblings': [{'age': 25, 'name': 'Scott', 'pet': 'Zuko'},
  {'age': 33, 'name': 'Katie', 'pet': 'Cisco'}]}
1
asjson = json.dumps(result) # convert to json
1
2
siblings = DataFrame(result['siblings'], columns=['name', 'age'])
siblings

























nameage
0Scott25
1Katie33

XML和HTML: Web信息收集

NB. The Yahoo! Finance API has changed and this example no longer works

1
2
3
4
5
6
from lxml.html import parse
from urllib.request import urlopen
parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))
doc = parsed.getroot()
1
2
links = doc.findall('.//a')
links[15:20]
[<Element a at 0x20352cad598>,
 <Element a at 0x20352cad5e8>,
 <Element a at 0x20352cad638>,
 <Element a at 0x20352cad688>,
 <Element a at 0x20352cad6d8>]
1
2
3
4
lnk = links[28]
lnk
lnk.get('href')
lnk.text_content()
<Element a at 0x20352cad9a8>






'/quote/NFLX?p=NFLX'






'NFLX'
1
2
urls = [lnk.get('href') for lnk in doc.findall('.//a')]
urls[-10:]
['//finance.yahoo.com/broker-comparison?bypass=true',
 'https://help.yahoo.com/kb/index?page=content&y=PROD_MAIL_ML&locale=en_US&id=SLN2310&actp=productlink',
 'http://help.yahoo.com/l/us/yahoo/finance/',
 'https://yahoo.uservoice.com/forums/382977',
 'http://info.yahoo.com/privacy/us/yahoo/',
 'http://info.yahoo.com/relevantads/',
 'http://info.yahoo.com/legal/us/yahoo/utos/utos-173.html',
 'http://twitter.com/YahooFinance',
 'http://facebook.com/yahoofinance',
 'http://yahoofinance.tumblr.com']
1
2
3
tables = doc.findall('.//table')
len(tables)
calls = tables[0]
1
1
rows = calls.findall('.//tr')
1
2
3
def _unpack(row, kind='td'):
elts = row.findall('.//%s' % kind)
return [val.text_content() for val in elts]
1
2
_unpack(rows[0], kind='th')
_unpack(rows[1], kind='td')
[]




---------------------------------------------------------------------------

IndexError                                Traceback (most recent call last)

<ipython-input-87-7d371ed47023> in <module>()
      1 _unpack(rows[0], kind='th')
----> 2 _unpack(rows[1], kind='td')


IndexError: list index out of range
1
2
3
4
5
6
7
from pandas.io.parsers import TextParser
def parse_options_data(table):
rows = table.findall('.//tr')
header = _unpack(rows[0], kind='th')
data = [_unpack(r) for r in rows[1:]]
return TextParser(data, names=header).get_chunk()
1
2
3
call_data = parse_options_data(calls)
put_data = parse_options_data(puts)
call_data[:10]

Parsing XML with lxml.objectify

1
2
3
4
5
from lxml import objectify
path = '.\ch06\mta_perf\Performance_MNR.xml'
parsed = objectify.parse(open(path))
root = parsed.getroot()
1
2
3
4
5
6
7
8
9
10
11
12
13
data = []
skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ',
'DESIRED_CHANGE', 'DECIMAL_PLACES']
for elt in root.INDICATOR:
el_data = {}
for child in elt.getchildren():
if child.tag in skip_fields:
continue
el_data[child.tag] = child.pyval
data.append(el_data)
1
2
perf = DataFrame(data)
perf.ix[:10, 5:]


































































































































INDICATOR_UNITMONTHLY_ACTUALMONTHLY_TARGETPERIOD_MONTHPERIOD_YEARYTD_ACTUALYTD_TARGET
0%96.9951200896.995
1%9595220089695
2%96.9953200896.395
3%98.3954200896.895
4%95.8955200896.695
5%94.4956200896.295
6%96957200896.295
7%96.4958200896.295
8%93.7959200895.995
9%96.4951020089695
10%96.99511200896.195

二进制数据格式

1
2
3
frame = pd.read_csv('ch06/ex1.csv')
frame
frame.to_pickle('ch06/frame_pickle')










































abcdmessage
01234hello
15678world
29101112foo

1
pd.read_pickle('ch06/frame_pickle')










































abcdmessage
01234hello
15678world
29101112foo

使用HDF5格式

1
2
3
4
store = pd.HDFStore('mydata.h5')
store['obj1'] = frame
store['obj1_col'] = frame['a']
store
<class 'pandas.io.pytables.HDFStore'>
File path: mydata.h5
/obj1                frame        (shape->[3,5])
/obj1_col            series       (shape->[3])  
1
store['obj1']










































abcdmessage
01234hello
15678world
29101112foo

1
2
store.close()
os.remove('mydata.h5')

使用数据库

1
2
3
4
5
6
7
8
9
10
11
import sqlite3
query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20),
c REAL, d INTEGER
);"""
con = sqlite3.connect(':memory:')
con.execute(query)
con.commit()
<sqlite3.Cursor at 0x2035487c880>
1
2
3
4
5
6
7
data = [('Atlanta', 'Georgia', 1.25, 6),
('Tallahassee', 'Florida', 2.6, 3),
('Sacramento', 'California', 1.7, 5)]
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"
con.executemany(stmt, data)
con.commit()
<sqlite3.Cursor at 0x2035487c810>
1
2
3
cursor = con.execute('select * from test')
rows = cursor.fetchall()
rows
[('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5)]
1
2
import pandas.io.sql as sql
sql.read_sql('select * from test', con)






































abcd
0AtlantaGeorgia1.256
1TallahasseeFlorida2.603
2SacramentoCalifornia1.705