2.3. 数据编码和处理
2.3.1. 读写CSV数据
如果你读取CSV数据的目的是做数据分析和统计的话，你可能需要看一看 Pandas 包。Pandas 包含了一个非常方便的函数叫 pandas.read_csv()，它可以把CSV数据加载到一个 DataFrame 对象中。
import csv
from collections import namedtuple
# List style: each record is returned as a plain list of strings.
# newline='' hands line-ending handling to the csv module, as its
# documentation requires for file objects passed to reader/writer.
with open('./stocks.csv', newline='') as f:
    f_csv = csv.reader(f)
    # The first line of the file holds the column names.
    headers = next(f_csv)
    print('headers:{}'.format(headers))
    for row in f_csv:
        # Positional access: column 0 of every data row.
        print(row[0])
# Named-tuple style: columns are reached by attribute name instead of index.
# newline='' hands line-ending handling to the csv module, as its
# documentation requires for file objects passed to reader/writer.
with open('./stocks.csv', newline='') as f:
    f_csv = csv.reader(f)
    headers = next(f_csv)
    print('headers:{}'.format(headers))
    # namedtuple needs every header to be a valid Python identifier;
    # it raises ValueError otherwise.
    Row = namedtuple('Row', headers)
    for r in f_csv:
        row = Row(*r)
        print(row.Symbol)
# Dict style: each record is a mapping keyed by column name.
# newline='' hands line-ending handling to the csv module, as its
# documentation requires for file objects passed to reader/writer.
with open('./stocks.csv', newline='') as f:
    f_csv = csv.DictReader(f)
    # DictReader consumes the header line itself; calling next() here would
    # return (and discard) the first DATA row, so read .fieldnames instead.
    headers = f_csv.fieldnames
    print('headers:{}'.format(headers))
    for row in f_csv:
        print(row['Symbol'])
# Column names for the sample data, in output order.
headers = [
    'Symbol',
    'Price',
    'Date',
    'Time',
    'Change',
    'Volume',
]

# Sample records as tuples; field order matches `headers` above.
rows = [
    ('AA', 39.48, '6/11/2007', '9:36am', -0.18, 181800),
    ('AIG', 71.38, '6/11/2007', '9:36am', -0.15, 195500),
    ('AXP', 62.58, '6/11/2007', '9:36am', -0.46, 935000),
]
import csv
def write_list_to_csv(filename, headers, rows):
    """Write a header row followed by data rows to a CSV file.

    Args:
        filename: Path of the file to create or overwrite.
        headers: Sequence of column names, written as the first row.
        rows: Iterable of row sequences, written after the header.
    """
    # newline='' stops the text layer from translating the writer's own
    # '\r\n' terminator, which would otherwise produce blank lines between
    # rows on platforms that expand '\n'.
    with open(filename, 'w', newline='') as f:
        f_csv = csv.writer(f)
        f_csv.writerow(headers)
        f_csv.writerows(rows)
# Write the tuple-based sample rows to s1.csv.
write_list_to_csv(filename='s1.csv', headers=headers, rows=rows)
# Sample records as dicts keyed by column name.
rows2 = [
    dict(zip(('Symbol', 'Price', 'Date', 'Time', 'Change', 'Volume'), record))
    for record in (
        ('AA', 39.48, '6/11/2007', '9:36am', -0.18, 181800),
        ('AIG', 71.38, '6/11/2007', '9:36am', -0.15, 195500),
        ('AXP', 62.58, '6/11/2007', '9:36am', -0.46, 935000),
    )
]
def write_dict_to_csv(filename, headers, rows):
    """Write dict records to a CSV file, preceded by a header row.

    Args:
        filename: Path of the file to create or overwrite.
        headers: Sequence of field names; sets the column order and is
            written as the header row.
        rows: Iterable of dicts keyed by the names in ``headers``.
    """
    # newline='' stops the text layer from translating the writer's own
    # '\r\n' terminator, which would otherwise produce blank lines between
    # rows on platforms that expand '\n'.
    with open(filename, 'w', newline='') as f:
        f_csv = csv.DictWriter(f, headers)
        f_csv.writeheader()
        f_csv.writerows(rows)
# Write the dict-based sample rows to s2.csv.
write_dict_to_csv(filename='s2.csv', headers=headers, rows=rows2)
2.3.2. 读写JSON数据
可以考虑使用pprint模块的 pprint() 函数来代替普通的 print() 函数。 它会按照key的字母顺序并以一种更加美观的方式输出。
import json
def write_json(data, filename='w1.json'):
    """Serialize ``data`` as JSON into a file.

    Args:
        data: Any object ``json.dump`` can serialize.
        filename: Destination path. Defaults to ``'w1.json'``, the path the
            function always used before this parameter existed.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f)
# Hand a small sample record to write_json.
write_json(data={'name': 'ACME', 'shares': 100, 'price': 542.23})
def read_json(filename):
    """Load and return the JSON document stored in ``filename``.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded Python object (dict, list, str, number, bool or None).
    """
    # Decode explicitly as UTF-8 rather than relying on the locale default.
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)
# Load w1.json through read_json and show the decoded object.
print(read_json(filename='w1.json'))
import json
from collections import OrderedDict

# JSON text to decode.
s = '{"name": "ACME", "shares": 50, "price": 490.1}'

# Decode every JSON object into an OrderedDict built from its key/value pairs.
result = json.JSONDecoder(object_pairs_hook=OrderedDict).decode(s)
print(result)
class MyJosnObj(object):
    """Expose the keys of a dict as attributes of an instance.

    Suitable as an ``object_hook`` for ``json.loads``, which calls it with
    each decoded JSON object.
    """

    def __init__(self, d) -> None:
        # Adopt the dict itself as the instance namespace, so every key in
        # ``d`` becomes readable as an attribute (no copy is made).
        self.__dict__ = d
# Decode s again, this time wrapping each JSON object in MyJosnObj so that
# its fields can be read as attributes.
result = json.loads(s, object_hook=MyJosnObj)
print(result.name)
class Point(object):
    """A simple object holding an ``x`` and a ``y`` attribute."""

    def __init__(self, x, y) -> None:
        self.x = x
        self.y = y
def serialize_instance(point_obj):
    """Turn an instance into a plain dict that ``json.dumps`` can encode.

    Args:
        point_obj: Any object that has an instance ``__dict__``.

    Returns:
        A new dict with the class name stored under ``'__classname__'``,
        followed by the object's instance attributes.
    """
    d = {'__classname__': type(point_obj).__name__}
    d.update(vars(point_obj))
    return d
# Registry mapping a serialized class name to the class to rebuild.
classMap = {'Point': Point}
def unserialize_object(d):
    """Rebuild an instance from a dict carrying a ``'__classname__'`` key.

    Args:
        d: A decoded JSON object. The ``'__classname__'`` key, if present,
            is removed from ``d`` in place.

    Returns:
        ``d`` itself when it has no ``'__classname__'`` key; otherwise a new
        instance of the class registered under that name in ``classMap``,
        with the remaining items set as attributes.

    Raises:
        KeyError: If the class name is not registered in ``classMap``.
    """
    clsname = d.pop('__classname__', None)
    if clsname is None:
        return d
    cls = classMap[clsname]
    # Create the instance without running __init__; attributes are restored
    # directly from the dict instead.
    obj = cls.__new__(cls)
    for k, v in d.items():
        setattr(obj, k, v)
    return obj
import json

# Encode a Point through serialize_instance ...
p = Point(2, 3)
s = json.dumps(p, default=serialize_instance)
print(s)

# ... and decode it back through unserialize_object.
p = json.loads(s, object_hook=unserialize_object)
print(p.x)
2.3.3. 解析简单的XML数据
可以使用 xml.etree.ElementTree 模块从简单的XML文档中提取数据。
对于更高级的应用程序，你需要考虑使用 lxml。
from urllib.request import urlopen
from xml.etree.ElementTree import parse
# Download the RSS feed and parse it.
# The response is used as a context manager so the connection is closed once
# the document has been parsed.
with urlopen('http://planet.python.org/rss20.xml') as u:
    doc = parse(u)

# Extract and output tags of interest
for item in doc.iterfind('channel/item'):
    title = item.findtext('title')
    date = item.findtext('pubDate')
    link = item.findtext('link')
    print(title)
    print(date)
    print(link)
    # Blank line between feed entries.
    print()
2.3.4. 增量式解析大型XML文件
from lxml import etree
from io import StringIO
class ElementHandler:
    """Parser target that prints parse events as they arrive.

    Implements the ``start`` / ``end`` / ``data`` / ``close`` methods that an
    XML parser calls on its ``target`` object.
    """

    def __init__(self):
        # Most recently opened tag; None until the first start event, so that
        # end() can be called safely before any start().
        self.current_tag = None

    def start(self, tag, attrib):
        """Record and report an opening tag (``attrib`` is not used)."""
        self.current_tag = tag
        print("start tag:{}".format(self.current_tag))

    def end(self, tag):
        """Report a closing tag, but only if it matches the last opened tag."""
        if tag == self.current_tag:
            print("end tag:{}".format(self.current_tag))

    def data(self, data):
        """Report a chunk of character data."""
        print('Data:', data)

    def close(self):
        """Report the end of the document."""
        print('End of document')
# Sample document to feed through the event-printing handler.
xml_data = """
<root>
<element key="value">Text content</element>
<element key="another_value">Another text content</element>
</root>
"""

# Attach the handler as the parser target, then parse the document from an
# in-memory text stream.
handler = ElementHandler()
parser = etree.XMLParser(target=handler)
etree.parse(StringIO(xml_data), parser)
2.3.5. 将字典转换为XML
# Sample record to convert to XML.
s = {
    'name': 'GOOG',
    'shares': 100,
    'price': 490.1,
}
from xml.etree.ElementTree import Element
from xml.etree.ElementTree import tostring
def dict2xml(tag, d):
    """Convert a dict to an XML string.

    Args:
        tag: Name of the enclosing element.
        d: Mapping of child element names to values; each value is converted
            with ``str()`` and XML-escaped.

    Returns:
        The XML text, one element per line, without an XML declaration.
    """
    # Local import so the block stays self-contained.
    from xml.sax.saxutils import escape

    xml = ['<{}>'.format(tag)]
    # Escape '&', '<' and '>' so that values cannot produce malformed XML.
    xml.extend(
        '<{0}>{1}</{0}>'.format(k, escape(str(v))) for k, v in d.items()
    )
    xml.append('</{}>'.format(tag))
    return '\n'.join(xml)
def dict2xmlv2(tag, d):
    """Convert a dict to an ``xml.etree.ElementTree.Element``.

    Args:
        tag: Name of the root element.
        d: Mapping of child element names to values.

    Returns:
        The root Element, with one child per item of ``d``.
    """
    elem = Element(tag)
    for k, v in d.items():
        child = Element(k)
        # Element text must be a string; a raw int or float here makes
        # tostring() raise TypeError when the tree is serialized.
        child.text = str(v)
        elem.append(child)
    return elem
# String-based conversion.
print(dict2xml(tag="root", d=s))

# Element-based conversion, serialized with tostring().
e = dict2xmlv2(tag="root", d=s)
print(tostring(e))
2.3.6. 解析和修改XML
修改一个XML文档结构是很容易的,但是你必须牢记的是所有的修改都是针对父节点元素, 将它作为一个列表来处理。例如,如果你删除某个元素,通过调用父节点的 remove() 方法从它的直接父节点中删除。 如果你插入或增加新的元素,你同样使用父节点元素的 insert() 和 append() 方法。 还能对元素使用索引和切片操作,比如 element[i] 或 element[i:j]
2.3.7. 利用命名空间解析XML文档
TODO：建议使用 lxml 解析。
2.3.8. 与关系型数据库的交互
# Sample rows to insert; the three fields line up with the three '?'
# placeholders of the insert statement.
stocks = [
    ('GOOG', 100, 490.1),
    ('AAPL', 50, 545.75),
    ('FB', 150, 7.45),
    ('HPQ', 75, 33.2),
]
import sqlite3
db = sqlite3.connect('database.db')
try:
    c = db.cursor()
    # NOTE: this raises sqlite3.OperationalError if the table already exists,
    # e.g. when the script is run a second time against the same file.
    c.execute('create table portfolio (symbol text, shares integer, price real)')
    db.commit()
    # '?' placeholders let sqlite3 bind the values itself rather than having
    # them formatted into the SQL text.
    c.executemany('insert into portfolio values (?,?,?)', stocks)
    db.commit()
    for row in db.execute('select * from portfolio'):
        print(row)
finally:
    # Release the database file even if one of the statements above fails.
    db.close()
2.3.9. 编码和解码十六进制数
函数 base64.b16decode() 和 base64.b16encode() 只能操作大写形式的十六进制字母，而 binascii 模块中的函数大小写都能处理。
In [1]: s = b'hello'
In [2]: import binascii
In [3]: h = binascii.b2a_hex(s)
In [4]: h
Out[4]: b'68656c6c6f'
In [5]: binascii.a2b_hex(h)
Out[5]: b'hello'
2.3.10. 编码解码Base64数据
base64 模块中有两个函数 b64encode() 和 b64decode()