pandas 中与sql语句对应的功能

1.单列where条件

select * from data where id < 8;
df[df.id < 8]
df[df['id'] < 8]
df[df.created == pd.Timestamp('2013-08-24 11:37:35')]  
df.query("id < 8")
df.query('object_name == "C_COBJ#"')
df[df.B == df['C C']]
# The previous expression is equivalent to
df.query("B == `C C`")

2. 排序sort

select * from data order by object_type desc;
df.sort_values(by='object_type', ascending=False) 
select * from data order by object_type desc, object_name asc;
df.sort_values(by=['object_type', 'object_name'], ascending=[False, True])

3. 逻辑运算 and(&) or(|) not (~)

3-1. & and

select * from data where id  < 8  and object_name = 'OBJ$';
df[(df.id < 8) & (df.object_name == 'OBJ$')]
# The previous expression is equivalent to
df.query("id<8 and object_name== 'OBJ$'")

3-2. | or

select * from data where value>1 or itemid=139476;

data[(data['db_name'] == 'shhxbase') | (data['nodes'] > 1)]

data[(data.value >1) | ( data.itemid==139476)]

data.query("value > 1 or itemid == 139476")

3-3. ~ not

select * from data where not (value>1 or itemid=139476);

data[~((data.value > 1) | (data.itemid == 139476))]

data.query( "value <= 1 and itemid != 139476")

4. null 判断

data[data.value.isna()]
or
data[data.value.notna()]

5. in or like

5-1. in

data[data.itemid.isin([139479, 139471])]
#not in
data[~data.itemid.isin([139479, 139471])]
data.query("itemid in (139479, 139471)")
data.query("itemid not in (139479, 139471)")

5-2. like

df[df.name.str.contains(r'Tablespace|Sysmetric')]

6. groupby

select itemid, count(1) from data group by itemid;
data.groupby('itemid').size()
data.groupby('itemid').count()
# select itemid, avg(value), count(1) from data group by itemid;
# 注意: 分组键 itemid 在 groupby 后变为索引, 不能写进 agg 的字典里; 用命名聚合
data.groupby('itemid').agg(avg_value=('value', 'mean'), cnt=('value', 'size'))
#select itemid, count(distinct value) from data group by itemid;
data.groupby('itemid').agg(dict(value=pd.Series.nunique))

7. join

# select * from emp, dept where emp.deptno = dept.deptno order by empno desc;
# 方法1
pd.merge(emp, dept, on='deptno').sort_values(by='empno', ascending=False)
#方法2
pd.merge(emp, dept, left_on='deptno', right_on='deptno')
#方法3
emp.set_index('deptno').join(dept.set_index('deptno'), on='deptno')

8. union all

pd.concat([df1, df2])

9. union

pd.concat([df1, df2]).drop_duplicates()

10. update

update data set value=value+1 where value<1;

data.loc[data.value < 1, 'value'] += 1

11. drop/add column

#使用drop()函数,此函数有一个列表形参labels,写的时候可以加上labels=[xxx],也可以不加,列表内罗列要删除行或者列的名称,默认是行名称,如果要删除列,则要增加参数axis=1,

#drop columns

test_dict_df.drop(['id'], axis=1)
# 替换原数据
test_dict_df.drop(['id'], axis=1, inplace=True)
# 增加列
emp.insert(0, 'test', ['test']*14)
# 调整列的位置, 把empno调到最后,此操作会就地修改数据
emp.insert(len(emp.columns), 'empno', emp.pop('empno'))



12.delete row

df.drop([index])

13. insert

# 增加一行 (注意: DataFrame.append 在 pandas 2.0 中已移除, 改用 pd.concat)
pd.concat([emp, emp.iloc[[2]]], ignore_index=True)
# df 增加多行
pd.concat([emp, emp.iloc[-3:]], ignore_index=True)
#  增加内置列表list
# nan = np.nan
row = [7902, 'FORD', 'ANALYST', 7566.0, pd.Timestamp('1981-12-03 00:00:00'), 3000.0, np.nan, 20]

# 方法1:使用参数ignore_index=True
row = pd.Series(row, index=emp.columns)
pd.concat([emp, row.to_frame().T], ignore_index=True)
# 方法2:给series命名,对应emp的索引 (name 会成为新行的索引标签)
row = pd.Series(row, index=emp.columns, name=14)
pd.concat([emp, row.to_frame().T])
# 方法3 在指定位置插入行,第二行位置,插入索引为88的行
df = emp.reindex(emp.index.insert(1, 88), copy=False)
df.loc[88] = row

14.拆分为多行

数据

country score value label
China,US 100 1 a
Japan,EU 150 2 b
UK,Australia 120 3 c
Singapore,Netherland 90 4 d

实现结果:

score value label country
100 1 a China
100 1 a US
150 2 b Japan
150 2 b EU
120 3 c UK
120 3 c Australia
90 4 d Singapore
90 4 d Netherland
df = pd.DataFrame({'country': ['China,US', 'Japan,EU', 'UK,Australia', 'Singapore,Netherland'],
                       'score': [100, 150, 120, 90],
                       'value': [1, 2, 3, 4],
                       'label': list('abcd')})

df.to_sql('df', odb, index=False, 
dtype=dict(country=VARCHAR(30), score=NUMBER(8), value=NUMBER(8), label=VARCHAR(10)))
select regexp_substr(country, '[^,]+', 1, level) country, score, value, label
  from df
connect by value= prior value --这里的value为主键
       and prior dbms_random.value is not null
       and level <= regexp_count(country, ',') + 1;
df.drop('country', axis=1).join( 
df['country'].str.split(',', expand=True).stack().reset_index(level=1, drop=True).rename('country')
).reset_index(drop=True)