pandas 1. basic and attributes

https://pandas.pydata.org/docs/ 참고

## pandas 1. basic and attributes

## 예제 5_1

```python

'''

pandas Series : basic and attributes

'''

import numpy as np

import pandas as pd

def testPd1():
    """Demonstrate basic ``pandas.Series`` construction, attributes and indexers.

    Builds a float64 Series with string labels, prints single elements,
    a few attributes and label/position slices, then repeats element access
    on a Series created without an explicit index. All results go to stdout.

    Returns:
        None
    """
    ser_a = pd.Series(
        np.array([1, 2, 3, 4, 5]),          # data
        index=['a', 'b', 'c', 'd', 'e'],    # index labels (kept distinct here)
        dtype=np.float64,                   # element data type
    )

    # Positional access goes through .iloc: a bare ser_a[1] on a
    # string-labelled Series depends on a deprecated positional fallback.
    print('1:', ser_a.iloc[1])
    print('2:', ser_a['b'])

    # attributes
    print('ndim', ser_a.ndim, 'size', ser_a.size, 'nbytes', ser_a.nbytes,
          'shape', ser_a.shape, sep='\n')
    print('values :', ser_a.values)
    print('empty :', ser_a.empty)

    # at, iat, loc, iloc
    print('1-th element:', ser_a.at['a'])                # one value, by label
    print('1-th element:', ser_a.iat[1])                 # one value, by position
    print('group element:', ser_a.loc[:'c'], sep='\n')   # label slice, 'c' included
    print('group element:', ser_a.iloc[:2], sep='\n')    # position slice, 2 excluded

    # Series created without an explicit index
    ser_a1 = pd.Series(['1', '2', '3', '4'])
    print(ser_a1[0])
    print(ser_a1.at[2])
    print(ser_a1.iat[2])  # the default labels are the integers 0, 1, 2, ...

    return None

def testPd2():
    """Demonstrate basic ``pandas.DataFrame`` construction and element access.

    Builds a 2x4 integer DataFrame with named columns and the default row
    index, then prints the frame, one column, single cells reached through
    implicit and explicit indexing, and the row index. Output goes to stdout.

    Returns:
        None
    """
    df_A = pd.DataFrame(
        [[1, 2, 3, 4], [2, 3, 4, 5]],                # data
        # index=['1st', '2nd'],                      # optional row labels
        columns=['col1', 'col2', 'col3', 'col4'],    # column labels
    )
    print('df_A :\n', df_A)

    # implicit: [] on a DataFrame cuts by column first and yields a Series,
    # so df_A[1] would be looked up as a column label, not as a row.
    print(df_A['col2'])
    print(df_A['col2'][1])  # the Series can then be indexed by its row label

    # explicit: with index=['1st', '2nd'] enabled above, the same kind of
    # lookup could be written df_A.at['1st', 'col2'].
    print(df_A.loc[:, :'col2'].iat[1, 1])

    # index is an attribute (no call parentheses); it evaluates to a value.
    print(df_A.index)  # a RangeIndex when no row labels were given

    return None

def main():
    """Run the Series demo and the DataFrame demo, separated by a divider."""
    testPd1()
    print('---------------')
    testPd2()
    return None


# Run the demos only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

```

```text

1: 2.0

2: 2.0

ndim

1

size

5

nbytes

40

shape

(5,)

values : [1. 2. 3. 4. 5.]

empty : False

1-th element: 1.0

1-th element: 2.0

group element:

a 1.0

b 2.0

c 3.0

dtype: float64

group element:

a 1.0

b 2.0

dtype: float64

1

3

3

---------------

df_A :

col1 col2 col3 col4

0 1 2 3 4

1 2 3 4 5

0 2

1 3

Name: col2, dtype: int64

3

3

RangeIndex(start=0, stop=2, step=1)

```

## 예제 5_2

```python

import numpy as np

import pandas as pd

# read a CSV file => inject the CSV file into a Dataframe

def testpd1():
    """Load the Sacramento crime CSV with explicit column dtypes and inspect it.

    Reads ``./SacramentocrimeJanuary2006.csv`` from the current directory and
    prints the first rows, the resulting column dtypes, and the ``nbytes`` of
    the first row.

    Returns:
        None
    """
    # Column name -> dtype requested from read_csv.
    dt_crimes = {
        'cdatetime': np.str_,
        'address': np.str_,
        'district': np.int8,
        'beat': np.object_,
        'grid': np.str_,
        'crimedescr': np.str_,
        'ucr_ncic_code': "S4",   # byte-string dtype: values print as b'2404'
        'latitude': np.float32,
        'longitude': np.float32,
    }

    crime_csv_path = "./SacramentocrimeJanuary2006.csv"
    df_csv = pd.read_csv(crime_csv_path, dtype=dt_crimes, sep=",")

    # head() / tail() mirror the Unix commands and show 5 rows by default.
    print('head :\n', df_csv.head())
    print(df_csv.dtypes)

    # Whole-frame alternatives: df_csv.memory_usage() and df_csv.size.
    # iloc[0, :] is the first row as a Series; nbytes is a Series attribute,
    # so the value printed after 'length =' is a byte count.
    print('length =', df_csv.iloc[0, :].nbytes)

    return None

def testpd2():
    """Load ``./ratings.csv`` with explicit dtypes and inspect a single row.

    Prints the row index, the column index, the second row as a Series, one
    element of that row by position, a label-membership test, and one element
    by label.

    Returns:
        None
    """
    csv_path = './ratings.csv'

    # Columns in the file: userId, movieId, rating, timestamp
    dt_ratings = {
        'userId': 'S8',
        'movieId': 'S8',
        'rating': np.float16,
        'timestamp': np.int64,
    }
    df_ratings = pd.read_csv(csv_path, dtype=dt_ratings)

    print(df_ratings.index)
    print(df_ratings.columns)

    # A single row comes back as a Series labelled by the column names.
    row_1 = df_ratings.iloc[1]
    print('row_1 :\n', row_1)

    # Position-based access is spelled .iloc / .iat; a bare row_1[1] on a
    # string-labelled Series relies on a deprecated positional fallback.
    print('row_1[1] :\n', row_1.iloc[1])

    print('rating' in row_1)   # membership is tested against the labels
    print(row_1['rating'])

    return None

def testpd3():
    """Load the Sacramento crime data from its Excel file and print the head.

    Reads ``./SacramentocrimeJanuary2006.xls`` from the current directory
    with ``pandas.read_excel`` and prints the first rows.

    Returns:
        None
    """
    crime_xls_file = "./SacramentocrimeJanuary2006.xls"
    df_xls = pd.read_excel(crime_xls_file)
    print('head :\n', df_xls.head())

    return None

def main():
    """Run the three file-loading demos in order, with divider lines between."""
    testpd1()
    print('--------------')
    testpd2()
    print('--------------')
    testpd3()
    return None


# Run the demos only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

```

```text

head :

cdatetime address district ... ucr_ncic_code latitude longitude

0 1/1/06 0:00 3108 OCCIDENTAL DR 3 ... b'2404' 38.550419 -121.391418

1 1/1/06 0:00 2082 EXPEDITION WAY 5 ... b'2204' 38.473499 -121.490189

2 1/1/06 0:00 4 PALEN CT 2 ... b'2404' 38.657845 -121.462097

3 1/1/06 0:00 22 BECKFORD CT 6 ... b'2501' 38.506775 -121.426949

4 1/1/06 0:00 3421 AUBURN BLVD 2 ... b'2299' 38.637447 -121.384613

[5 rows x 9 columns]

cdatetime object

address object

district int8

beat object

grid object

crimedescr object

ucr_ncic_code object

latitude float32

longitude float32

dtype: object

length = 72

--------------

RangeIndex(start=0, stop=25000095, step=1)

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

row_1 :

userId b'1'

movieId b'306'

rating 3.5

timestamp 1147868817

Name: 1, dtype: object

row_1[1] :

b'306'

True

3.5

--------------

head :

cdatetime address district ... ucr_ncic_code latitude longitude

0 2001-01-06 00:00:00 3108 OCCIDENTAL DR 3 ... 2404 38.550420 -121.391416

1 2001-01-06 00:00:00 2082 EXPEDITION WAY 5 ... 2204 38.473501 -121.490186

2 2001-01-06 00:00:00 4 PALEN CT 2 ... 2404 38.657846 -121.462101

3 2001-01-06 00:00:00 22 BECKFORD CT 6 ... 2501 38.506774 -121.426951

4 2001-01-06 00:00:00 3421 AUBURN BLVD 2 ... 2299 38.637448 -121.384613

[5 rows x 9 columns]

```

## 예제 5_3 (loc,iloc,at,iat)

```python

import numpy as np

import pandas as pd

def testpd1():
    """Show descriptive statistics, ranking and boolean masking on a DataFrame.

    Builds a four-row DataFrame of latitude/longitude values and prints, in
    order: ``describe()``, an empty line, the latitude ranking, the
    ``latitude > 38.6`` mask, ``mask.any()``, and the frame indexed by that
    mask.

    Returns:
        None
    """
    crime_loc = {
        'latitude': [38.65042047, 37.47350069, 38.65784584, 38.50677377],
        'longitude': [-121.3914158, -121.4901858, -121.4621009, -121.4269508],
    }
    df_crime_loc = pd.DataFrame(crime_loc)

    print(df_crime_loc.describe())
    print('')

    # Individual descriptive statistics (min, max, mean, std, mode, median)
    # can also be taken from a single column; left disabled here:
    #     sr_lat = df_crime_loc['latitude']
    #     print(sr_lat.min())
    #     print(sr_lat.mean())
    #     print(sr_lat.mode())
    #     print(sr_lat.median())
    # ...or from a single row:
    #     print(df_crime_loc.iloc[1].mean())

    # list the ranking
    print(df_crime_loc[['latitude']].rank())

    # crime location: latitude > 38.6
    # To keep the flags as an extra column instead:
    #     df_crime_loc['check'] = df_crime_loc[['latitude']] > 38.6
    df_crime_loc_tag = df_crime_loc[['latitude']] > 38.6
    print(df_crime_loc_tag)
    print(df_crime_loc_tag.any())

    # result: indexing with a boolean DataFrame masks cell by cell, and the
    # mask has no 'longitude' column, so that column comes back all NaN.
    print(df_crime_loc[df_crime_loc_tag])

    return None

def main():
    """Run the statistics / ranking / masking demo."""
    testpd1()
    return None


# Run the demo only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

```

```text

latitude longitude

count 4.000000 4.000000

mean 38.322135 -121.442663

std 0.570013 0.042854

min 37.473501 -121.490186

25% 38.248456 -121.469122

50% 38.578597 -121.444526

75% 38.652277 -121.418067

max 38.657846 -121.391416

latitude

0 3.0

1 1.0

2 4.0

3 2.0

latitude

0 True

1 False

2 True

3 False

latitude True

dtype: bool

latitude longitude

0 38.650420 NaN

1 NaN NaN

2 38.657846 NaN

3 NaN NaN

```

## 예제 5_4 (trim)

```python

import numpy as np

import pandas as pd

# read a CSV file => inject the CSV file into a Dataframe

def testpd1():
    """Select column subsets ("trim") from the Sacramento crime CSV.

    Reads ``./SacramentocrimeJanuary2006.csv`` from the current directory and
    prints a one-column selection followed by a three-column selection.

    Returns:
        None
    """
    crime_csv_path = "./SacramentocrimeJanuary2006.csv"
    df_csv = pd.read_csv(crime_csv_path, sep=",")

    # Trim down to 'address'. A list of labels (double brackets) keeps the
    # result a one-column DataFrame; df_csv['address'] would give a Series.
    df_address = df_csv[['address']]
    print(df_address)

    # Trim down to a DataFrame of 'address', 'crimedescr' and 'grid'.
    df_subset = df_csv[['address', 'crimedescr', 'grid']]
    print(df_subset)

    return None

def main():
    """Run the column-selection demo."""
    testpd1()
    return None


# Run the demo only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

```

```text

address

0 3108 OCCIDENTAL DR

1 2082 EXPEDITION WAY

2 4 PALEN CT

3 22 BECKFORD CT

4 3421 AUBURN BLVD

... ...

7579 26TH ST / G ST

7580 4011 FREEPORT BLVD

7581 30TH ST / K ST

7582 5303 FRANKLIN BLVD

7583 COBBLE COVE LN / COBBLE SHORES DR

[7584 rows x 1 columns]

address crimedescr grid

0 3108 OCCIDENTAL DR 10851(A)VC TAKE VEH W/O OWNER 1115

1 2082 EXPEDITION WAY 459 PC BURGLARY RESIDENCE 1512

2 4 PALEN CT 10851(A)VC TAKE VEH W/O OWNER 212

3 22 BECKFORD CT 476 PC PASS FICTICIOUS CHECK 1443

4 3421 AUBURN BLVD 459 PC BURGLARY-UNSPECIFIED 508

... ... ... ...

7579 26TH ST / G ST 594(B)(2)(A) VANDALISM/ -$400 728

7580 4011 FREEPORT BLVD 459 PC BURGLARY BUSINESS 957

7581 30TH ST / K ST TRAFFIC-ACCIDENT INJURY 841

7582 5303 FRANKLIN BLVD 3056 PAROLE VIO - I RPT 969

7583 COBBLE COVE LN / COBBLE SHORES DR TRAFFIC-ACCIDENT-NON INJURY 1294

[7584 rows x 3 columns]

```

noir1458's blog

pandas 1. basic and attributes

작성자: noir1458

댓글 쓰기

0 댓글

Categories

study

Computer Science

Programming

Problem Solving

Math

Tags