# Data_StackExchange_Python

``````import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from collections import defaultdict
from collections import Counter
from numpy import linalg as LA
import statsmodels.api as sm
import matplotlib.cm as cm
from datetime import datetime as dt
import sys
from os import listdir
from scipy.stats.stats import pearsonr
from matplotlib.dates import YearLocator
``````

StackExchange（以下简称SE）是世界上最大的专业性问答社区之一。最早只有StackOverflow一个站点，后来逐渐发展出其他问答社区，现在一共有一百多个社区。在这里可以看到所有社区。

这个问答给出了SE历史数据的下载地址。本文给出对SE数据的初步处理示例。

``````def dailyQA(site):
F = defaultdict(lambda:[0,0])
path='/Users/csid/Documents/bigdata/stackexchange/unzip/'
filename = path + site + '/Posts.xml'
with open(filename,'r') as f:
for line in f:
try:
label = line.split('PostTypeId=')[1][1:2]
day = line.split('CreationDate=')[1][1:11]
if label == '1':
F[day][0]+=1
if label == '2':
F[day][1]+=1
except:
pass
return F

#plot the monthly growth of sites in terms of Na and Nq
def plotMonth(site, ax, col):
    """Shade the band between a site's monthly question and answer counts.

    Daily counts are read from the module-level ``F[site]`` mapping and
    summed per ``'YYYY-MM'`` month. The first and last month are dropped
    before plotting, and nothing is drawn unless more than three months
    remain.

    Parameters
    ----------
    site : str
        Key into the module-level ``F`` dictionary.
    ax : matplotlib axes
        Axes to draw on.
    col : colour
        Fill colour of the band.
    """
    M = defaultdict(lambda: np.array([0, 0]))
    f = F[site]
    for day in f:
        M[day[:7]] += np.array(f[day])
    # drop the first and the last month of the record
    ms = sorted(M.keys())[1:-1]
    if len(ms) > 3:
        x, y = np.array([M[m] for m in ms]).T
        mm = [dt.strptime(m, '%Y-%m') for m in ms]
        ax.fill_between(mm, x, y, color=col, alpha=0.1)
        # faint white outlines separate overlapping bands
        ax.plot(mm, x, color="white", linestyle='-', marker='', alpha=0.1)
        ax.plot(mm, y, color="white", linestyle='-', marker='', alpha=0.1)

def plotMonthSpecial(site, ax, col):
    """Draw a site's monthly question and answer curves as solid lines.

    Daily counts are read from the module-level ``F[site]`` mapping and
    summed per ``'YYYY-MM'`` month. The first two months and the last
    month are dropped. A vertical segment joins the two curves at the
    first plotted month.

    Parameters
    ----------
    site : str
        Key into the module-level ``F`` dictionary.
    ax : matplotlib axes
        Axes to draw on.
    col : colour
        Line colour.
    """
    M = defaultdict(lambda: np.array([0, 0]))
    f = F[site]
    for day in f:
        M[day[:7]] += np.array(f[day])
    ms = sorted(M.keys())[2:-1]
    if not ms:
        # fewer than four months of data: nothing left to plot
        return
    x, y = np.array([M[m] for m in ms]).T
    mm = [dt.strptime(m, '%Y-%m') for m in ms]
    ax.vlines(mm[0], x[0], y[0], color=col, linestyle='-')
    ax.plot(mm, x, color=col, linestyle='-', marker='')
    ax.plot(mm, y, color=col, linestyle='-', marker='')
``````

``````path='/Users/csid/Documents/bigdata/stackexchange/unzip/'
sites = [ f for f in listdir(path) if f[-1]=='m']
F={}
for i in sites:
flushPrint(sites.index(i))
F[i] = dailyQA(i)
``````

``````# plot good sites at first then plot bad sites
# S maps site name -> (total questions, total answers) over all days.
S = {}
for i in sites:
    daily = list(F[i].values())
    # column sums; yields (0, 0) for a site with no parsed posts, where
    # unpacking zip(*...) of an empty sequence would raise
    S[i] = (sum(d[0] for d in daily), sum(d[1] for d in daily))
# sites ordered by total number of questions, largest first
rsites = [i for i, j in sorted(S.items(), key=lambda x: -x[1][0])]
``````

``````fig = plt.figure(figsize=(12, 5),facecolor='white')
ax = plt.subplot(111)
years = YearLocator()
cmap = cm.get_cmap('PiYG', 10)
for i in rsites:
c = int(np.log(S[i][0])-5)
plotMonth(i,ax,cmap(c))
plotMonthSpecial('physics.stackexchange.com',ax,'RoyalBlue')
plotMonthSpecial('cooking.stackexchange.com',ax,'DarkOliveGreen')
ax.set_yscale('log')
ax.set_ylim(1,10**6)
ax.set_xlabel('Time')
ax.set_ylabel('Monthly increased N of Q&A')
ax.xaxis.set_major_locator(years)
smm = plt.cm.ScalarMappable(cmap=cmap, norm=plt.Normalize(vmin=0, vmax=10))
smm._A = []
cbaxes = fig.add_axes([0.15, 0.85, 0.5, 0.015])
cbar = plt.colorbar(smm,cax=cbaxes,orientation='horizontal')
plt.show()
``````

``````def userDailyAnswers(site):
C={}
filename = path + site + '/Posts.xml'
with open(filename,'r') as f:
for line in f:
try:
label = line.split('PostTypeId=')[1][1:2]
if label == '2':
date = line.split('CreationDate=')[1][1:11]
time = line.split('CreationDate=')[1][12:20]
author = int(line.split('OwnerUserId=')[1].split(r'"')[1])
questionID = int(line.split('ParentId=')[1].split(r'"')[1])
if date in C:
if author in C[date]:
C[date][author]+=[(time,questionID)]
else:
C[date][author]=[(time,questionID)]
else:
C[date]={author:[(time,questionID)]}
except:
pass
return C

# calculate entropy of path angles
def entropy(G, O, K, T):
    """Return the Shannon entropy (natural log) of the edge directions.

    Every edge ``(i, j)`` of ``G`` is treated as a vector from
    ``(O[i], K[i])`` to ``(O[j], K[j])``. Its direction is expressed in
    degrees in ``[0, 360]``, rounded to one decimal, and the entropy of
    the resulting frequency distribution is returned.

    Parameters
    ----------
    G : graph
        Any object whose ``edges()`` yields ``(i, j)`` pairs.
    O, K : mapping
        Node -> x coordinate and node -> y coordinate.
    T : object
        Unused; kept so the signature matches the other plotting helpers.

    Returns
    -------
    float
        ``-sum(p * log(p))`` over the distinct rounded angles; ``0.0``
        when there is no edge with a defined direction.
    """
    angles = []
    for i, j in G.edges():
        dx, dy = np.array([O[j], K[j]]) - np.array([O[i], K[i]])
        dis = LA.norm([dx, dy])
        if dis == 0:
            # both end points coincide: the direction is undefined
            continue
        # clip guards arccos against ratios a rounding error beyond +-1
        angle = np.round(180 * np.arccos(np.clip(dx / dis, -1.0, 1.0)) / np.pi, 1)
        if dy < 0:
            # arccos only covers [0, 180]; mirror for the lower half plane
            angle = 360 - angle
        angles.append(angle)
    if not angles:
        return 0.0
    # list(...) is required: on Python 3 .values() is a view, not a list
    ps = np.array(list(Counter(angles).values()), dtype=float)
    ps = ps / ps.sum()
    return -(ps * np.log(ps)).sum()

def getSiteFlowdata(site):
    """Build the answer-flow network of one site and its flow distances.

    For the second half of the site's active days, each user's answers of
    a day form a trail ``source -> q1 -> q2 -> ... -> sink`` over the
    question ids. Trails are collected until ``maxuser`` user-day trails
    have been used in total, and consecutive pairs are counted as
    weighted edges.

    Parameters
    ----------
    site : str
        Name of the site directory, passed to ``userDailyAnswers``.

    Returns
    -------
    tuple
        ``(G, O, K, T)``: the weighted ``nx.DiGraph``, the results of
        ``flowDistanceFromSource(G)`` and ``flowDistanceToSink(G)``
        (helpers defined outside this section), and the weighted
        out-degree of ``G``.
    """
    # NOTE(review): the original body read an undefined global ``C`` and
    # ignored ``site``; loading the answers here is the assumed intent.
    C = userDailyAnswers(site)
    days = sorted(C.keys())
    E = defaultdict(lambda: 0)
    n = 0
    maxuser = 100
    # floor division: a float is not a valid slice index on Python 3
    for day in days[len(days) // 2:]:
        d = C[day]
        f = sorted(d.items(), key=lambda x: x[1])
        for i, j in f:
            if n < maxuser:
                n += 1
                q = [p for o, p in j]
                q = ['source'] + q + ['sink']
                for a, b in zip(q[:-1], q[1:]):
                    E[(a, b)] += 1

    G = nx.DiGraph()
    for (x, y), w in E.items():
        # the 'weight' attribute is what okplot and out_degree read back
        G.add_edge(x, y, weight=w)
    O = flowDistanceFromSource(G)
    K = flowDistanceToSink(G)
    T = G.out_degree(weight='weight')
    return G, O, K, T

# orthogonal okplot
def okplot(G, O, K, T):
    """Set up the (O, K) scatter plane for the edges of ``G``.

    Draws the diagonal reference line from (0, 0) to (4, 4) and labels
    the axes with the mathtext names L_oi and L_ik.

    Parameters
    ----------
    G : nx.DiGraph
        Graph whose edges carry a ``'weight'`` attribute.
    O, K : mapping
        Node -> x coordinate and node -> y coordinate.
    T : object
        Unused here.
    """
    plt.plot([0, 4], [0, 4], 'r-', alpha=0.5)
    for i, j in G.edges():
        wi = G[i][j]['weight']
        x1, y1 = O[i], K[i]
        x2, y2 = O[j], K[j]
        dx = x2 - x1
        dy = y2 - y1
        # NOTE(review): the per-edge start point and displacement are
        # computed but never drawn; the drawing call appears to be missing
        # from this copy of the code - TODO restore it.
    # mathtext labels; the quoting and '$' delimiters had been stripped
    plt.xlabel(r'$L_{oi}$', size=16)
    plt.ylabel(r'$L_{ik}$', size=16)

# rescaled orthogonal okplot
def rescaledokplot(G, O, K, T):
    """Accumulate edge-displacement statistics and fix the plot limits.

    Walks the edges of ``G`` as vectors from ``(O[i], K[i])`` to
    ``(O[j], K[j])`` and tracks the longest edge, the summed displacement
    and the total edge length, then sets both axes to [-2, 2].

    Parameters
    ----------
    G : nx.DiGraph
        Graph whose edges carry a ``'weight'`` attribute.
    O, K : mapping
        Node -> x coordinate and node -> y coordinate.
    T : object
        Unused here.
    """
    r = 0           # length of the longest edge seen so far
    Dx = 0
    Dy = 0          # summed displacement over all edges
    tr = 0          # total edge length
    for i, j in G.edges():
        wi = G[i][j]['weight']
        x1, y1 = O[i], K[i]
        x2, y2 = O[j], K[j]
        dx = x2 - x1
        dy = y2 - y1
        Dx += dx
        Dy += dy
        rr = np.sqrt(dx**2 + dy**2)
        tr += rr
        if rr > r:
            r = rr
    # NOTE(review): r, Dx, Dy and tr are never used below; the rescaling
    # and drawing step appears to be missing from this copy - TODO restore.
    lim = 2
    plt.xlim(-lim, lim)
    plt.ylim(-lim, lim)
``````

``````i='physics.stackexchange.com'
j='cooking.stackexchange.com'
G1,O1,K1,T1=getSiteFlowdata(i)
G2,O2,K2,T2=getSiteFlowdata(j)
# okplot demo
fig = plt.figure(figsize=(12, 6),facecolor='white')
ax = plt.subplot(121)
okplot(G1,O1,K1,T1)
ax = plt.subplot(122)
okplot(G2,O2,K2,T2)
plt.tight_layout()
plt.show()
``````

``````
# Path-angle entropy of the two demo networks (physics first, cooking second).
entropy(G1,O1,K1,T1),entropy(G2,O2,K2,T2)
``````

``````# construct network and calculate path entropy
# D maps site name -> entropy of the edge angles of its answer-flow network.
D = {}
for k, site in enumerate(sites):
    if site == 'ebooks.stackexchange.com' or site == 'stackoverflow.com':
        continue
    # progress indicator; enumerate avoids a linear sites.index() per site
    flushPrint(k)
    # NOTE(review): the original loop read an undefined global ``C`` that
    # never changed between sites; loading the answers per site is the
    # assumed intent.
    C = userDailyAnswers(site)
    days = sorted(C.keys())
    E = defaultdict(lambda: 0)
    n = 0
    maxuser = 100
    # second half of the active days; floor division keeps the index an int
    for day in days[len(days) // 2:]:
        d = C[day]
        f = sorted(d.items(), key=lambda x: x[1])
        for i, j in f:
            if n < maxuser:
                n += 1
                q = [p for o, p in j]
                q = ['source'] + q + ['sink']
                for a, b in zip(q[:-1], q[1:]):
                    E[(a, b)] += 1
    G = nx.DiGraph()
    for (x, y), w in E.items():
        # the 'weight' attribute is read back by out_degree below
        G.add_edge(x, y, weight=w)
    O = flowDistanceFromSource(G)
    K = flowDistanceToSink(G)
    T = G.out_degree(weight='weight')
    D[site] = entropy(G, O, K, T)

# Columns: entropy, S[i][0], S[i][1]. S stores (questions, answers), so
# despite the names ``a`` holds question totals and ``q`` answer totals.
l, a, q = np.array([(D[i], S[i][0], S[i][1]) for i in D
                    if i in S and i != 'aviation.stackexchange.com']).T
# log-linear fit of q against the entropy (OLSRegressFit is defined
# outside this section)
cs, beta, r2 = OLSRegressFit(l, np.log(q))
fig = plt.figure(figsize=(8, 8))
# the plotted series is S[i][1], i.e. answers, so it is labelled as such
plt.plot(l, q, linestyle='', marker='s', color='RoyalBlue', label='N of Answers')
plt.plot(l, np.exp(cs + beta * l), linestyle='-', marker='', color='Brown')
plt.yscale('log')
plt.legend(loc=1, numpoints=1)
plt.xlabel('Entropy of angles', size=16)
plt.ylabel('N of Questions & Answers', size=16)
plt.show()
``````

``````
# Pearson correlation between the per-site entropy and the log of the counts held in q.
pearsonr(l,np.log(q))
``````

### 推荐阅读更多精彩内容

• Android 自定义View的各种姿势1 Activity的显示之ViewRootImpl详解 Activity...
passiontim阅读 157,821评论 24 688
• 能走开的都不是最爱，走不开的是命定。
清顾阅读 53评论 0 1
• 昨天晚上，你对我说：“妈妈，我想再补一门课，我只补了一门新概念英语是不够的。”孩子，妈妈很为你这种努力求上进的学习...
生活馈赠与我阅读 124评论 2 3
• 我国发现的最早的钓鱼文物是陕西省西安半坡村发现的骨制鱼钓和黑龙江小兴凯湖岗上出土的骨制鱼钩，距今大约有六千...
文澄澈阅读 236评论 0 3