PYTHON視覺化:瞭解資料

CopperDong發表於2017-09-30

一、簡介

       本章涵蓋匯入和匯出各種格式資料的基本知識,也介紹清理資料的方式,例如值的歸一化處理、缺失資料的補齊、即時資料檢查以及其他類似的技巧,以便正確地準備資料來進行視覺化。

二、從CSV檔案匯入資料

       CSV檔案由以逗號分隔的值組成(檔案的第一行是檔案頭,同樣以逗號分隔)。

#!/usr/bin/env python
# Read a comma-separated file whose first row is a header, then print the
# header and every data row. Written to run on both Python 2 and Python 3.
import csv
import sys  # needed by the error handler below

filename = 'ch02-data.csv'
header = None  # stays None when the file is empty
data = []
try:
    # NOTE: on Python 3 the csv docs recommend open(filename, newline='') so
    # that newlines embedded in quoted fields are parsed correctly.
    with open(filename) as f:
        reader = csv.reader(f)
        # The first row is the header; the default avoids StopIteration on
        # an empty file.
        header = next(reader, None)
        # Every remaining row is data.
        data = list(reader)
except csv.Error as e:
    print("Error reading CSV file at line %s: %s" % (reader.line_num, e))
    sys.exit(-1)

if header:
    print(header)
    print('==================')
for datarow in data:
    print(datarow)
     如果處理大資料檔案可以用numpy.loadtxt

import numpy
# Load every cell as text. dtype=str is the portable spelling; the 'string'
# alias dates from Python 2 (NOTE(review): current NumPy releases are
# expected to reject it -- confirm against the installed version).
data = numpy.loadtxt('ch02-data.csv', dtype=str, delimiter=',')
三、從Microsoft Excel檔案中匯入資料

      通常做法是把資料從Excel中匯出到CSV格式的檔案中

      Python中可以使用xlrd模組(安裝方式:pip install xlrd)

#!/usr/bin/env python 
# Load every cell of worksheet 'Sheet1' into a list of rows and pretty-print it.
# NOTE(review): xlrd 2.0 and later are understood to read only legacy .xls
# files -- confirm the installed xlrd version can open this .xlsx workbook.
import xlrd
from pprint import pprint

workbook_path = 'ch02-xlsxdata.xlsx'
workbook = xlrd.open_workbook(filename=workbook_path)
sheet = workbook.sheet_by_name('Sheet1')

# One inner list per worksheet row, one entry per column.
dataset = [
    [sheet.cell(row_idx, col_idx).value for col_idx in range(sheet.ncols)]
    for row_idx in range(sheet.nrows)
]
pprint(dataset)

四、從定寬資料檔案匯入資料

  檔案中的欄位是有固定寬度的

  可用Python中的struct模組

import struct 
import string

datafile = 'ch02-fixed-width-1M.data'

# Record layout: three byte-string fields of 9, 14 and 5 bytes.
mask = '9s14s5s'
# Compile the format once instead of re-parsing it for every line.
record = struct.Struct(mask)

# Binary mode: on Python 3 struct only unpacks bytes, not text-mode str.
with open(datafile, 'rb') as f:
    for line in f:
        if not line.rstrip(b'\r\n'):
            # A blank line is shorter than one record and would make
            # unpack_from raise struct.error.
            continue
        fields = record.unpack_from(line)
        # strip() removes the leading and trailing padding of each field.
        # latin-1 maps every byte to a character, so decoding cannot fail.
        # NOTE(review): the file's real encoding is not visible here -- adjust.
        print('fields:  %s' % (
            [field.strip().decode('latin-1') for field in fields],))

五、從製表符分隔的檔案中讀取資料

  它可能匯出自Excel檔案,也可能是一些定製軟體的輸出

  可以用CSV模組

# Read a tab-separated file whose first row is a header, then print the
# header and every data row. Written to run on both Python 2 and Python 3.
import csv
import sys  # needed by the error handler below

filename = 'ch02-data.tab'
header = None  # stays None when the file is empty
data = []
try:
    with open(filename) as f:
        # The excel_tab dialect switches the delimiter from comma to tab.
        reader = csv.reader(f, dialect=csv.excel_tab)
        # The first row is the header; the default avoids StopIteration on
        # an empty file.
        header = next(reader, None)
        # Every remaining row is data.
        data = list(reader)
except csv.Error as e:
    print("Error reading CSV file at line %s: %s" % (reader.line_num, e))
    sys.exit(-1)

if header:
    print(header)
    print('===================')

for datarow in data:
    print(datarow)
六、從JSON資料來源匯入資料

  JavaScript Object Notation(JSON)作為一種平臺無關的格式被廣泛地應用於系統間或者應用間的資料交換

  使用requests模組獲取資料

七、匯出資料到JSON、CSV和Excel

  寫入資料

  例項:

八、從資料庫匯入資料

  使用SQL drivers

九、清理異常值

  is_outlier

十、讀取大塊資料檔案


十一、讀取流資料來源

   import os

十二、匯入影象資料到NumPy陣列

   接下來會介紹如何用NumPy和SciPy這兩種Python庫來做影象處理

import matplotlib.pyplot as plt

# scipy.misc.lena() has been removed from SciPy, so use the `ascent` sample
# image instead. It lives in scipy.datasets in current releases and in
# scipy.misc in older ones.
# NOTE(review): scipy.datasets is understood to need the optional `pooch`
# package to fetch the image -- confirm for the target environment.
try:
    from scipy.datasets import ascent
except ImportError:
    from scipy.misc import ascent

image = ascent()   # 2-D grayscale image as a NumPy array
plt.gray()         # use a grayscale colormap
plt.imshow(image)
plt.colorbar()
plt.show()
十三、生成可控的隨機資料集合

  可以用假資料來了解統計方法是不是能夠得到我們想要的模型。因為已經預先知道了模型,所以我們可以把統計方法應用到已知資料上進行驗證。在真實場景下,我們是沒辦法做到這一點的,因為我們必須要估計到,總會有一定程度的不確定性因素存在,可能導致錯誤的結果。

  使用random模組

# Uniform distribution: histogram of SAMPLE_SIZE draws from random.random().
import pylab
import random

SAMPLE_SIZE = 10000
random.seed()  # seed random generator
# range() exists on both Python 2 and 3 (xrange is Python 2 only).
real_rand_vars = [random.random() for _ in range(SAMPLE_SIZE)]
pylab.hist(real_rand_vars, 10)  # create histogram from data in 10 buckets
pylab.xlabel("Number range")
pylab.ylabel("Count")
pylab.show()
# Add Gaussian noise: plot a running total of normally distributed steps.
import random
import pylab

duration = 100      # number of time steps
mean_inc = 0.2      # mean of each increment
std_dev_inc = 1.2   # standard deviation of each increment

x = range(duration)
y = []
level = 0  # running total, starts at zero
for _ in x:
    # Each step moves the level by one Gaussian-distributed increment.
    level += random.normalvariate(mean_inc, std_dev_inc)
    y.append(level)

pylab.plot(x, y)
pylab.xlabel("Time")
pylab.ylabel("Value")
pylab.show()
    如果想要更多的控制,可以使用不同的分佈

# coding: utf-8
# Draw one histogram per distribution offered by the random module.
import random
import matplotlib
import matplotlib.pyplot as plt

SAMPLE_SIZE = 1000

# histogram buckets
buckets = 100


def _draw_samples(draw, count=SAMPLE_SIZE):
    """Return a list of *count* values, each produced by calling *draw*()."""
    # range(count) yields exactly `count` draws; the earlier
    # xrange(1, SAMPLE_SIZE) produced one fewer and exists only on Python 2.
    return [draw() for _ in range(count)]


# (x-axis label, zero-argument sampler) for each subplot, in plot order.
distributions = [
    # Next random float in the range [0.0, 1.0).
    ("random.random", random.random),
    # Random float N with a <= N <= b; whether the end-point b is included
    # depends on floating-point rounding in a + (b-a) * random().
    ("random.uniform", lambda: random.uniform(1, SAMPLE_SIZE)),
    # Random float N with low <= N <= high; the mode defaults to the
    # midpoint between the bounds, giving a symmetric distribution.
    ("random.triangular", lambda: random.triangular(1, SAMPLE_SIZE)),
    # Beta distribution (alpha > 0, beta > 0); values lie between 0 and 1.
    ("random.betavariate", lambda: random.betavariate(1, 10)),
    # Exponential distribution; lambd is 1.0 divided by the desired mean.
    ("random.expovariate",
     lambda: random.expovariate(1.0 / ((SAMPLE_SIZE + 1) / 2.))),
    # Gamma distribution (not the gamma function), alpha > 0 and beta > 0:
    #
    #           x ** (alpha - 1) * math.exp(-x / beta)
    # pdf(x) =  --------------------------------------
    #             math.gamma(alpha) * beta ** alpha
    ("random.gammavariate", lambda: random.gammavariate(1, 10)),
    # Log normal distribution: the natural logarithm of the values is
    # normally distributed with mean mu and standard deviation sigma > 0.
    ("random.lognormvariate", lambda: random.lognormvariate(1, 0.5)),
    # Normal distribution; mu is the mean, sigma the standard deviation.
    ("random.normalvariate", lambda: random.normalvariate(1, 0.5)),
    # Pareto distribution; alpha is the shape parameter.
    ("random.paretovariate", lambda: random.paretovariate(1)),
]

plt.figure()

# we need to update font size just for this example
matplotlib.rcParams.update({'font.size': 7})

# Same 6 x 2 grid as before: positions 1..9 correspond to 621..629.
for position, (label, draw) in enumerate(distributions, start=1):
    plt.subplot(6, 2, position)
    plt.xlabel(label)
    plt.hist(_draw_samples(draw), buckets)

plt.tight_layout()
plt.show()
十四、真實資料的噪聲平滑處理

  清理真實資料來源的資料,這些演算法在訊號處理領域很有名

  基礎演算法是基於滾動視窗模式(例如卷積)

from pylab import *
from numpy import *

def moving_average(interval, window_size):
    """Smooth *interval* with a uniform (boxcar) window.

    The window holds int(window_size) equal weights of 1 / window_size.
    The convolution runs in 'same' mode, so the result has as many points
    as the longer of the two inputs.
    """
    width = int(window_size)
    kernel = ones(width) / float(window_size)
    smoothed = convolve(interval, kernel, mode='same')
    return smoothed

# Build a noisy sine wave: 100 samples on [-4, 4] plus scaled Gaussian noise.
t = linspace(-4, 4, 100)
noise = 0.1 * randn(len(t))
y = sin(t) + noise

# Raw samples as black dots.
plot(t, y, "k.")

# Moving average over a 10-sample window, drawn as a red line.
y_av = moving_average(y, 10)
plot(t, y_av, "r")

xlabel("Time")
ylabel("Value")
grid(True)
show()

 


相關文章