Pandas Tips: Useful Usage and Techniques#

Treat the String ‘NA’ as NA#

keep_default_na=False

How to Change Line Endings to LF in Windows#

line_terminator='\n'

How to Handle Data in Shift_JIS Encoding#

encoding='cp932'

CSVのformat#

quoting=csv.QUOTE_ALL
quoting=csv.QUOTE_MINIMAL
quoting=csv.QUOTE_NONNUMERIC
quoting=csv.QUOTE_NONE

Remove Duplicate Rows#

df.drop_duplicates()

Add a Column to DataFrame#

df['name_state'] = df['name'].str.cat(df['state'], sep=' in ')

Rename Columns in DataFrame#

df_new = df.rename(columns={'A': 'Col_1', 'C': 'Col_3'})

Merge DataFrames#

print(df_ab.merge(df_ac))
print(pd.merge(df_ab, df_ac, on='a'))
print(pd.merge(df_ab, df_ac, on='a', how='outer'))

Filtering Rows Based on a Column Value Like WHERE Clause#

print(df['state'] == 'CA')

String Search in DataFrame#

# 特定の文字列を含む
print(df[df['name'].str.contains('li')])

# 特定の文字列で終わる
print(df[df['name'].str.endswith('li')])

# 特定の文字列で始まる
print(df[df['name'].str.startswith('li')])

# 正規表現のパターンに一致する
print(df[df['name'].str.match('li')])

Control Display of DataFrames#

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_seq_items', 500)

Reset Options in pandas#

pd.reset_option('display.max_seq_items')

Modify Values Based on Conditions#

df.loc[df['A'] < 0, 'A'] = -10

Round Up Decimal Places#

# float計算値をintに変換したものとの差が0以上のもの1プラスする

Split Output CSV into Multiple Files#

k = 10000  # 1DataFrameあたりの行数
dfs = [df.loc[i:i+k-1, :] for i in range(0, len(df), k)]
for i, df_i in enumerate(dfs):
  fname = str(i) + ".csv"
  df_i.to_csv(fname)