HOME LAB : HANDS-ON
#Python basics - comments for single lines
'''
Multiple line comments used with 3 single quotes
'''
print('''
Multiple line comments used with 3 single quotes
Python basics - comments for single lines
''')
## line continuation using a backslash '\'
total1 = 1+2+3+4+\
4+5+7
print(total1)
print("")
Multiple line comments used with 3 single quotes Python basics - comments for single lines 26
##variables
## declaring and assigning variables
age=32
height=5.5
name="student"
is_student=True
print("age:",age)
print("height:",height)
print("name:",name)
age: 32 height: 5.5 name: student
## naming conventions
## variable names should be descriptive
## they must start with a letter or '_' and can then be followed by letters, numbers & underscores
## variables are case-sensitive
first_name="student"
last_name="Jhon"
##invalid names
#2age=32 # cannot start with a digit
#first-name="student" # hyphens are not allowed
#@height=32 # cannot start with a symbol
## Python is dynamically typed = the type of a variable is determined at runtime
age=25 #int
height=5.5 #float
name="string" #string
is_student=True #boolean
## Type checking and conversion
type(height)
age=25
print(type(age))
#type conversion
age_str=str(age)
print(age_str)
print(type(age_str))
print("from integer to string converted")
<class 'int'> 25 <class 'str'> from integer to string converted
age="25"
print(age)
print(type(int(age)))
25 <class 'int'>
## Dynamic typing
## python allows the type of a variable to change as the program executes
var =10
print(var,type(var))
var="hello"
print(var,type(var))
var=5.11
print(var,type(var))
10 <class 'int'> hello <class 'str'> 5.11 <class 'float'>
## input
age=input("what is the age")
print(age,type(age))
height=float(input("what is the height")) #converting the value into float
print(height,type(height))
5 <class 'str'> 3.0 <class 'float'>
## simple calculator
num1=float(input("Enter the first number:"))
num2=float(input("Enter the second number:"))
sum1 = num1 + num2
difference = num1 - num2
multiply = num1 * num2
division = num1 / num2
print("sum is :", sum1)
print("Difference is :", difference)
print("Multiplication is :", multiply)
print("Division is :", division)
sum is : 8.0 Difference is : 4.0 Multiplication is : 12.0 Division is : 3.0
## common errors
# result="hello" + 5 #can only concatenate str (not "int") to str
# to avoid the error, use type casting
result="hello" + str(5)
print(result)
hello5
a=10
b=5
add_result=a+b
sub_result=a-b
mult_result=a*b
div_result=a/b
floor_div_result=a//b #floor division = discards the decimal part
modulus_result=a%b #modulus operation = remainder
exponent_result= a**b #exponentiation
print(add_result)
print(sub_result)
print(mult_result)
print(div_result)
print(floor_div_result)
print(modulus_result)
print(exponent_result)
15 5 50 2.0 2 0 100000
## comparison operators
## ==
a=10
b=11
print(a==b)
print(a!=b)
print(a<b)
print(a>b)
print(a<=b)
print(a>=b)
False True True False True False
##not equal to !=
str1="student"
str2="Student"
print(str1!=str2)
True
## logical operators: and, or, not (see the sketch below)
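The notebook names and/or/not but never demonstrates them; a minimal sketch, with values chosen to match the comparison example above:
a, b = 10, 11
print(a < b and b < 20) # True : both conditions hold
print(a > b or b == 11) # True : at least one condition holds
print(not a == 10) # False : negates the comparison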
Conditional statements
x,y=10.6,10.5
if x < y:
    print("x is less than y")
else:
    print("y is greater than x")
print("***********")
x1,y1=10.7,10.1
print("sum of x+y",x1+y1)
print("****if elif else*******")
if x1 < y1:
    print("x1 is less than y1")
elif x1 == y1:
    print("x1 is equal to y1")
else:
    print("y1 is greater than x1")
y is greater than x *********** sum of x+y 20.799999999999997 ****if elif else******* y1 is greater than x1
## nested conditional statement
# check the num is even or odd or negative
num=int(input("Enter the number to check"))
if num>0:
    print("The number is positive")
    if num%2==0:
        print("the number is even")
    else:
        print("The number is Odd")
else:
    print("The number is either Zero or negative")
The number is positive The number is Odd
#example
# check the leap year using nested conditionals
year=int(input("Enter the year"))
if (year%4==0):
    if (year%100==0):
        if (year%400==0):
            print(year," is a leap year")
        else:
            print(year,"is not a leap year")
    else:
        print(year," is a leap year")
else:
    print(year," is not a leap year")
1999 is not a leap year
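The same check collapses into a single boolean expression; a minimal sketch, equivalent to the nested version above:
year = 1999
is_leap = (year % 4 == 0 and year % 100 != 0) or year % 400 == 0
print(year, "is a leap year" if is_leap else "is not a leap year")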
x1,y1=10.7,10.1
x=2
print("****for*******")
###for
days =["mon","tue","wed","thu","fri","sat","sun"]
for d in days:
    print(d)
****for******* mon tue wed thu fri sat sun
for i in range(5):
    print(i)
print("next statements")
for i in range(1,6):
    print(i)
print("next statements with steps of 2")
for i in range(1,10,2):
    print(i)
0 1 2 3 4 next statements 1 2 3 4 5 next statements with steps of 2 1 3 5 7 9
print("**while*********")
####while
x1,y1=3,4
x=2
while x < x1:
    print ("x",x)
    x = x + 1
    print("x+1",x)
print("**while*********")
count=0
while count<5:
    print(count)
    count=count+1
**while********* x 2 x+1 3 **while********* 0 1 2 3 4
##break statement
for i in range(10):
    if i==5:
        break
    print(i)
0 1 2 3 4
## continue skips the rest of the current iteration
for i in range(10):
    if i%2==0:
        continue
    print(i)
1 3 5 7 9
### pass = it does nothing (a placeholder statement)
for i in range(5):
    if i==3:
        print("the number is",i)
        pass
    print(i)
0 1 2 the number is 3 3 4
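pass is mainly useful as a placeholder for a body that has not been written yet; a minimal sketch (the function name is illustrative):
def not_implemented_yet():
    pass # stub: the body will be filled in later
not_implemented_yet() # runs without error and does nothing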
### nested loops
for i in range(3):
    for j in range (2):
        print(f"i:{i} and j:{j}")
i:0 and j:0 i:0 and j:1 i:1 and j:0 i:1 and j:1 i:2 and j:0 i:2 and j:1
##example - calculate the sum of the first N natural numbers using a while and FOR loop
n=10
sum1=0
count=1
while count<=n:
    sum1=sum1+count
    count=count+1
print("sum of first 10 natural numbers", sum1)
## next loop FOR
for i in range(10):
    sum1=sum1+i # note: sum1 is not reset, so this adds 0..9 (=45) on top of the 55 above
print(sum1)
sum of first 10 natural numbers 55 100
#prime numbers
for num in range(1,101):
    if num>1:
        for i in range(2,num):
            if num%i==0:
                break
        else:
            print(num)
2 3 5 7 11 13 17 19 23 29 31 37 41 43 47 53 59 61 67 71 73 79 83 89 97
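The else on the inner for loop above is Python's for-else construct: the else block runs only when the loop finishes without hitting break. A minimal sketch:
for i in range(2, 9):
    if 9 % i == 0:
        print("9 is divisible by", i)
        break
else:
    print("no divisor found") # runs only when the loop never breaks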
mytuple=(1,9,4,5,"gui")
myarray=[2,3,6,7,"iro"]
myset={10,47,58,435,4564,"set"}
mydict1={"one":1,3:"three",4.8:"deci",8:["four","good",1,90]}
print(type(mytuple))
print(type(myarray))
print(type(myset))
print(type(mydict1))
<class 'tuple'> <class 'list'> <class 'set'> <class 'dict'>
## Lists
myarray=[2,3,6,7,"iro","banana","cherry","bread",7.01]
print(myarray[1:])
print(myarray[1:5])
print(myarray[-1:])
[3, 6, 7, 'iro', 'banana', 'cherry', 'bread', 7.01] [3, 6, 7, 'iro'] [7.01]
## modifying lists
myarray[1]="car"
print(myarray[0:])
myarray[1:]="jelly" # assigning a string to a slice spreads its characters into the list
print(myarray)
[2, 'car', 6, 7, 'iro', 'banana', 'cherry', 'bread', 7.01] [2, 'j', 'e', 'l', 'l', 'y']
myarray=[2,3,6,7,"iro","banana","cherry","bread",7.01]
myarray.append("motor12")
print("adding to the last using append",myarray)
myarray.insert(2,"Skyline")
print("adding to the specified index",myarray)
myarray.remove("iro")
print("removing the first occurrence",myarray)
myarray.reverse()
print("reversing the array", myarray)
adding to the last using append [2, 3, 6, 7, 'iro', 'banana', 'cherry', 'bread', 7.01, 'motor12'] adding to the specified index [2, 3, 'Skyline', 6, 7, 'iro', 'banana', 'cherry', 'bread', 7.01, 'motor12'] removing the first occurrence [2, 3, 'Skyline', 6, 7, 'banana', 'cherry', 'bread', 7.01, 'motor12'] reversing the array ['motor12', 7.01, 'bread', 'cherry', 'banana', 7, 6, 'Skyline', 3, 2]
myarray=[2,3,6,7,1,4,5,8,9]
print(myarray[0:])
print(myarray[1:5])
print(myarray[-1:])
print(myarray[::2])
print(myarray[::-1])
[2, 3, 6, 7, 1, 4, 5, 8, 9] [3, 6, 7, 1] [9] [2, 6, 1, 5, 9] [9, 8, 5, 4, 1, 7, 6, 3, 2]
myarray=[2,3,6,7,1,4,5,8,9]
for i,d in enumerate(myarray):
    print(i,d)
0 2 1 3 2 6 3 7 4 1 5 4 6 5 7 8 8 9
##list comprehension
list1=[]
for x in range(10):
    list1.append(x**2)
print(list1)
##next in single statement
[x**2 for x in range(10)]
[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]
[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]
Basic syntax: [expression for item in iterable]
with conditional logic: [expression for item in iterable if condition]
square=[num**2 for num in range(10)]
print("Squares list is ",square)
print("even numbers in loop")
list2=[]
for i in range(10):
    if i%2==0:
        list2.append(i)
print(list2)
print("even numbers in list comprehension")
even_numbers=[num for num in range(10) if num%2==0]
print("even_numbers list is",even_numbers)
print("nested list comprehension")
lst1=[1,2,3,4,5]
lst2=['a','b','c','d']
pair=[[i,j] for i in lst1 for j in lst2]
print(pair)
Squares list is [0, 1, 4, 9, 16, 25, 36, 49, 64, 81] even numbers in loop [0, 2, 4, 6, 8] even numbers in list comprehension even_numbers list is [0, 2, 4, 6, 8] nested list comprehension [[1, 'a'], [1, 'b'], [1, 'c'], [1, 'd'], [2, 'a'], [2, 'b'], [2, 'c'], [2, 'd'], [3, 'a'], [3, 'b'], [3, 'c'], [3, 'd'], [4, 'a'], [4, 'b'], [4, 'c'], [4, 'd'], [5, 'a'], [5, 'b'], [5, 'c'], [5, 'd']]
### creating tuples
print("creating list using list command")
lst=list()
print(type(lst))
##############
print("###################")
print("creating tuple using tuple command")
tpl=tuple()
print(type(tpl))
##############
print("###################")
print("creating list directly")
lst1=[1,2,3,4,5]
print(type(lst1))
print("creating tuple directly")
tpl1=(1,5,7,9,2)
print(type(tpl1))
##############
print("###################")
print("converting into tuple using list")
num9=tuple([1,2,3,4,5,67,9])
print(f" {type(num9)} is {num9}")
print("converting into LIST using tuple")
num10=list((1,2,3,4,5,6,7,8,9,0))
print(f" {type(num10)} is {num10}")
mixed_tppl1= (11,"hello","cat","Dog","Sun")
creating list using list command <class 'list'> ################### creating tuple using tuple command <class 'tuple'> ################### creating list directly <class 'list'> creating tuple directly <class 'tuple'> ################### converting into tuple using list <class 'tuple'> is (1, 2, 3, 4, 5, 67, 9) converting into LIST using tuple <class 'list'> is [1, 2, 3, 4, 5, 6, 7, 8, 9, 0]
### accessing tuples
tpl1=(1,5,7,9,2)
print(tpl1[2])
##############
print("###################")
concatenate_tpl1= tpl1 + mixed_tppl1
print(concatenate_tpl1)
7 ################### (1, 5, 7, 9, 2, 11, 'hello', 'cat', 'Dog', 'Sun')
tpl1=(1,5,7,9,2)
##tuple methods
tpl1=(1,5,7,9,2,3,4,6,8,29)
for i,d in enumerate(tpl1):
    print(i,d)
##############
print("#######-tuple-methods-############")
print(tpl1.count(1)) # find the count of 1 present in the tuple
print(tpl1.index(29)) # finds the corresponding index number for the value
0 1 1 5 2 7 3 9 4 2 5 3 6 4 7 6 8 8 9 29 #######-tuple-methods-############ 1 9
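Tuples are immutable, so item assignment raises a TypeError; a minimal sketch:
tpl1=(1,5,7,9,2)
try:
    tpl1[0]=99 # tuples do not support item assignment
except TypeError as ex:
    print(ex)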
## packing and unpacking tuples
##############
print("######-packing-tuple#############")
packed_tpl=1,"hello",3.14
print(packed_tpl)
##############
print("######-unpacking-tuple#############")
a,b,c=packed_tpl
print(a)
print(b)
print(c)
######-packing-tuple############# (1, 'hello', 3.14) ######-unpacking-tuple############# 1 hello 3.14
## unpacking tuple with star *
##############
print("########-unpacking-tuple-with-*-###########")
numbers=(1,2,3,4,5,67,8,9,10,11)
first,*middle,last=numbers
print(first)
print(middle)
print(last)
########-unpacking-tuple-with-*-########### 1 [2, 3, 4, 5, 67, 8, 9, 10] 11
#####-nested-List-#########
lst=[[1,2,3,4,5],[6,0,2,4,5,6],["hello","cat",90,45]]
print(lst[0])
print(lst[0][3])
print(lst[0][0:3])
print("#######-tuple-inside-the-list-############")
lst_tpl=[[1,2,3,4,5],[6,0,2,4,5,6],("hello","cat",90,45)]
print(lst_tpl[2])
[1, 2, 3, 4, 5] 4 [1, 2, 3] #######-tuple-inside-the-list-############ ('hello', 'cat', 90, 45)
#####-nested-tuples-#########
print("#####-nested-tuples-#########")
nested_tuple=((1,2,3),(90,34,56),("a","b","c"),(True,False))
print(nested_tuple[2])
print(nested_tuple[1][2])
print("#####-iteration-tuples-#########")
for sub_tpl in nested_tuple:
    for item in sub_tpl:
        print(item,end=" ")
    print()
#####-nested-tuples-######### ('a', 'b', 'c') 56 #####-iteration-tuples-######### 1 2 3 90 34 56 a b c True False
Sets are unordered and do not allow duplicate elements; they are used for eliminating duplicate entries and for performing mathematical set operations like union, intersection, difference and symmetric difference
######## Creating Set ######
print("######## Creating Set ######")
set1={1,2,3,4,56,7,8,9,0}
print(set1)
print(type(set1))
######## Creating Set ###### {0, 1, 2, 3, 4, 7, 8, 9, 56} <class 'set'>
print("######## ignoring duplicate element in set & using set command ######")
set2=set([1,2,3,4,56,7,8,9,0,1,2,3,89])
print(set2)
######## ignoring duplicate element in set & using set command ###### {0, 1, 2, 3, 4, 7, 8, 9, 56, 89}
######## Basic Set operations ######
set2=set([1,2,3,4,56,7,8,9,0,1,2,3,89])
print("######## adding an element - Basic Set operations ######")
set2.add(99) #adding an element
print(set2)
print("######## removing an element - Basic Set operations ######")
set2.remove(1) #removing an element
print(set2)
# print("######## removing an element not in the set- this will result in error######")
# set2.remove(1) #removing an element
# print(set2)
print("######## removing an element not in the set- but to ignore the error ######")
set2.discard(1) #discard removes an element without raising an error if it is missing
print(set2)
######## adding an element - Basic Set operations ###### {0, 1, 2, 3, 4, 99, 7, 8, 9, 56, 89} ######## removing an element - Basic Set operations ###### {0, 2, 3, 4, 99, 7, 8, 9, 56, 89} ######## removing an element not in the set- but to ignore the error ###### {0, 2, 3, 4, 99, 7, 8, 9, 56, 89}
### pop method
set2=set([1,2,3,4,56,7,8,9,0,1,2,3,89])
print(set2)
print("######## using pop method ######")
rem_element_pop=set2.pop() # pop removes and returns an arbitrary element
print(rem_element_pop)
print(set2)
{0, 1, 2, 3, 4, 7, 8, 9, 56, 89} ######## using pop method ###### 0 {1, 2, 3, 4, 7, 8, 9, 56, 89}
## set membership test
print("######## set membership test ######")
my_set={1,2,3,4,45}
print(3 in my_set)
print(10 in my_set)
######## set membership test ###### True False
## math operations
set1={1,2,3,4,5,6,7,8,9}
print(type(set1))
set2={9,3,4,5,21}
print("######## math operations: union ######")
union1_set=set1.union(set2)
print(union1_set)
print("######## math operations: intersection means common elements ######")
inter_set1=set1.intersection(set2)
print(inter_set1)
print("######## math operations: difference will remove common elements ######")
diff_set1=set1.difference(set2)
print(diff_set1)
print("######## math operations: symmetric difference will get the unique elements ######")
symm_set1=set1.symmetric_difference(set2)
print(symm_set1)
<class 'set'> ######## math operations: union ###### {1, 2, 3, 4, 5, 6, 7, 8, 9, 21} ######## math operations: intersection means common elements ###### {9, 3, 4, 5} ######## math operations: difference will remove common elements ###### {1, 2, 6, 7, 8} ######## math operations: symmetric difference will get the unique elements ###### {1, 2, 6, 7, 8, 21}
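The same four operations also have operator shorthands; a minimal sketch using the sets above:
set1={1,2,3,4,5,6,7,8,9}
set2={9,3,4,5,21}
print(set1 | set2) # union
print(set1 & set2) # intersection
print(set1 - set2) # difference
print(set1 ^ set2) # symmetric difference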
set1={1,2,3}
set2={3,4,5}
## is the subset
print(set1.issubset(set2))
print(set1.issuperset(set2))
set3={1,2,3,4,5}
set4={3,4,5}
## is the subset
print(set3.issubset(set4))
print(set3.issuperset(set4))
False False False True
##############
print("###### convert a list in to have only unique element or remove duplicates #############")
lst8=[1,1,12,2,2,3,3,3,4,5,6,7,7,7,8,9,0,12]
set(lst8)
###### convert a list in to have only unique element or remove duplicates #############
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12}
## counting unique words in text
text1="This lesson is about python baiscs. Python fundamentals like list, tuple, sets and dicitonary"
words=text1.split()
## convert list of the above words to set to get unique words
unique1=set(words)
print(unique1)
print(len(unique1))
{'lesson', 'is', 'Python', 'This', 'fundamentals', 'python', 'baiscs.', 'tuple,', 'about', 'sets', 'and', 'dicitonary', 'list,', 'like'} 14
Dictionaries are collections of key-value pairs.
###### Dictionaries ########
print("######## empty list ###########")
lst=list()
print(type(lst))
print("######## empty tuple ###########")
tpl=tuple()
print(type(tpl))
print("######## empty set ###########")
set7=set()
print(type(set7))
print("######## Empty Dictionaries ###########")
num_dict={}
print(type(num_dict))
num_dict1=dict()
print(type(num_dict1))
######## empty list ########### <class 'list'> ######## empty tuple ########### <class 'tuple'> ######## empty set ########### <class 'set'> ######## Empty Dictionaries ########### <class 'dict'> <class 'dict'>
###### Dictionaries : Key Value ########
print("######## Dictionaries : Key Value ###########")
student1={"name":"Student","age":32,"grade":"A1"}
print(type(student1))
print(student1)
######## Dictionaries : Key Value ########### <class 'dict'> {'name': 'Student', 'age': 32, 'grade': 'A1'}
###### Dictionaries : Key Value : duplication ########
print('''######## Dictionaries : Key Value : duplication : won't throw an error,
but the later value replaces the earlier one ########### ''')
student1={"name":"Student","age":32,"grade":"A1","name":"HTC"}
print(type(student1))
print(student1)
######## Dictionaries : Key Value : duplication : won't throw an error, but the later value replaces the earlier one ########### <class 'dict'> {'name': 'HTC', 'age': 32, 'grade': 'A1'}
###### Dictionaries : Key Value : accessing value ########
print('''######## Dictionaries : Key/Value Dictionaries
: Key Value : accessing value ########### ''')
student1={"name":"Student","age":32,"grade":"A1","Area1":"HTC","last_name":"Doc"}
print(type(student1))
print(student1['name'])
print(student1['Area1'])
print('''######## Dictionaries : Key/Value : Dictionaries
: Key Value : accessing value using get ########### ''')
print(student1.get('age'))
print(student1.get('last_name'))
print('''######## Dictionaries: when the key is not present,
get returns None ########### ''')
print(student1.get('phone'))
print('''######## Dictionaries: when the key is not present,
assigning a default value ########### ''')
print(student1.get('phone',"not available"))
######## Dictionaries : Key/Value Dictionaries : Key Value : accessing value ########### <class 'dict'> Student HTC ######## Dictionaries : Key/Value : Dictionaries : Key Value : accessing value using get ########### 32 Doc ######## Dictionaries: when the key is not present, get returns None ########### None ######## Dictionaries: when the key is not present, assigning a default value ########### not available
###### Dictionaries : Key is always unique : Value : Modifying / mutable ########
print('''######## Dictionaries : Key is always unique : Value
: Modifying / mutable ########### ''')
student_new2={"name":"Student","age":32,"grade":"A1","Area1":"HTC","last_name":"Doc"}
print(student_new2)
print('''######## Dictionaries :
changing value for a key ########### ''')
student_new2["age"]=19
print(student_new2)
print('''######## Dictionaries :
adding new key/value ########### ''')
student_new2["address"]="ITC"
print(student_new2)
print('''######## Dictionaries :
deleting a key/value ########### ''')
del student_new2["last_name"]
print(student_new2)
######## Dictionaries : Key is always unique : Value : Modifying / mutable ########### {'name': 'Student', 'age': 32, 'grade': 'A1', 'Area1': 'HTC', 'last_name': 'Doc'} ######## Dictionaries : changing value for a key ########### {'name': 'Student', 'age': 19, 'grade': 'A1', 'Area1': 'HTC', 'last_name': 'Doc'} ######## Dictionaries : adding new key/value ########### {'name': 'Student', 'age': 19, 'grade': 'A1', 'Area1': 'HTC', 'last_name': 'Doc', 'address': 'ITC'} ######## Dictionaries : deleting a key/value ########### {'name': 'Student', 'age': 19, 'grade': 'A1', 'Area1': 'HTC', 'address': 'ITC'}
### Dictionary methods
print("######-Dictionary methods-#############")
keys1=student_new2.keys()
print("get all keys",keys1)
values1=student_new2.values()
print("get all values",values1)
items1=student_new2.items()
print("get all keys/values in pair",items1)
######-Dictionary methods-############# get all keys dict_keys(['name', 'age', 'grade', 'Area1', 'address']) get all values dict_values(['Student', 19, 'A1', 'HTC', 'ITC']) get all keys/values in pair dict_items([('name', 'Student'), ('age', 19), ('grade', 'A1'), ('Area1', 'HTC'), ('address', 'ITC')])
### shallow copy vs plain assignment - a common beginner mistake
student_new2={"name":"Student","age":32,"grade":"A1","Area1":"HTC","last_name":"Doc"}
## plain assignment - both names refer to the same dict
student_new2_copy=student_new2
print(student_new2)
print(student_new2_copy)
print(" ")
### plain assignment: when a value of a key is updated, both variables see the change
print(''' ## normal copy function,
when a value of the key is updated,
both the variables are updated''')
student_new2["name"]="Devilarea"
print(student_new2)
print(student_new2_copy)
print(" ")
#### copy() makes a shallow copy, so updating one no longer changes the other
print(''' # shallow copy, will not update
both the variable # ''')
student_new2=student_new2.copy()
print("shallow copy done",student_new2)
print("shallow copy done",student_new2_copy)
print(" ")
student_new2["name"]="Lion"
print(student_new2)
print(" ")
print(student_new2_copy)
{'name': 'Student', 'age': 32, 'grade': 'A1', 'Area1': 'HTC', 'last_name': 'Doc'} {'name': 'Student', 'age': 32, 'grade': 'A1', 'Area1': 'HTC', 'last_name': 'Doc'} ## normal copy function, when a value of the key is updated, both the variables are updated {'name': 'Devilarea', 'age': 32, 'grade': 'A1', 'Area1': 'HTC', 'last_name': 'Doc'} {'name': 'Devilarea', 'age': 32, 'grade': 'A1', 'Area1': 'HTC', 'last_name': 'Doc'} # shallow copy, will not update both the variable # shallow copy done {'name': 'Devilarea', 'age': 32, 'grade': 'A1', 'Area1': 'HTC', 'last_name': 'Doc'} shallow copy done {'name': 'Devilarea', 'age': 32, 'grade': 'A1', 'Area1': 'HTC', 'last_name': 'Doc'} {'name': 'Lion', 'age': 32, 'grade': 'A1', 'Area1': 'HTC', 'last_name': 'Doc'} {'name': 'Devilarea', 'age': 32, 'grade': 'A1', 'Area1': 'HTC', 'last_name': 'Doc'}
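copy() only copies the top level; for nested dictionaries the inner dicts are still shared, so the standard library's copy.deepcopy is needed. A minimal sketch (the data is chosen to mirror the nested-dictionary example below):
import copy
nested={"student":{"name":"Thor","age":32}}
shallow=nested.copy()
deep=copy.deepcopy(nested)
nested["student"]["age"]=99
print(shallow["student"]["age"]) # 99 - the inner dict is still shared with the shallow copy
print(deep["student"]["age"]) # 32 - the deep copy is fully independent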
### iterating dictionaries
print("#iterate keys only")
for keys3 in student_new2.keys():
    print(keys3)
print("#")
print("#iterate values only ")
for value3 in student_new2.values():
    print(value3)
print("#")
## iterate key value pairs
print("#iterate keys/values")
for key4,value4 in student_new2.items():
    print(f"{key4}:{value4}")
print("#")
#iterate keys only name age grade Area1 last_name # #iterate values only Lion 32 A1 HTC Doc # #iterate keys/values name:Lion age:32 grade:A1 Area1:HTC last_name:Doc #
### nested dictionaries are like MongoDB documents
student_nest1={
"Student1":{"name":"Thor","age":32,"grade":"A1"},
"Student2":{"name":"Loki","age":23,"grade":"B1"}
}
print(student_nest1)
{'Student1': {'name': 'Thor', 'age': 32, 'grade': 'A1'}, 'Student2': {'name': 'Loki', 'age': 23, 'grade': 'B1'}}
## access nested dictionaries
print("access nested dictionaries")
print(student_nest1["Student1"]["age"])
print(student_nest1["Student1"]["name"])
print("##")
print(student_nest1.items())
print("##")
access nested dictionaries 32 Thor ## dict_items([('Student1', {'name': 'Thor', 'age': 32, 'grade': 'A1'}), ('Student2', {'name': 'Loki', 'age': 23, 'grade': 'B1'})]) ##
### iterating over nested dictionaries
for student_id,student_info in student_nest1.items():
    print(f"{student_id}:{student_info}")
    for key,value in student_info.items():
        print(f"{key}:{value}")
Student1:{'name': 'Thor', 'age': 32, 'grade': 'A1'} name:Thor age:32 grade:A1 Student2:{'name': 'Loki', 'age': 23, 'grade': 'B1'} name:Loki age:23 grade:B1
## dictionary comprehension
squares1={x:x**2 for x in range(5)}
print(squares1)
{0: 0, 1: 1, 2: 4, 3: 9, 4: 16}
## conditional dictionary comprehension
evens1={ x:x**2 for x in range(10) if x%2==0 }
print(evens1)
{0: 0, 2: 4, 4: 16, 6: 36, 8: 64}
### practical examples
## use a dictionary to count the frequency of the elements in the list
number_freq1= [1,1,1,1,3,3,3,4,4,5,6,6,5,7,8,9,0,0,0,0,5]
frequency1={}
for num11 in number_freq1:
    if num11 in frequency1:
        frequency1[num11]+=1
    else:
        frequency1[num11]=1
print(frequency1)
{1: 4, 3: 3, 4: 2, 5: 3, 6: 2, 7: 1, 8: 1, 9: 1, 0: 4}
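The standard library's collections.Counter produces the same frequency table in one step; a minimal sketch:
from collections import Counter
number_freq1=[1,1,1,1,3,3,3,4,4,5,6,6,5,7,8,9,0,0,0,0,5]
print(Counter(number_freq1)) # Counter maps each element to its count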
## merge 2 dictionaries
dict1={"a":1,"b":2}
dict2={"c":3,"d":4}
merged_dict1={**dict1,**dict2}
print(merged_dict1)
{'a': 1, 'b': 2, 'c': 3, 'd': 4}
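From Python 3.9 onward the | operator merges dictionaries the same way; a minimal sketch:
dict1={"a":1,"b":2}
dict2={"c":3,"d":4}
print(dict1 | dict2) # {'a': 1, 'b': 2, 'c': 3, 'd': 4} (Python 3.9+)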
## create a to-do list
to_do_list=["Buy groceries","clean the house","pay bills"]
#adding to to_do_list
to_do_list.append("Schedule meeting")
to_do_list.append("Go for a run")
#removing a completed task
to_do_list.remove("clean the house")
## checking if the task is in the list
if "pay bills" in to_do_list:
    print("Dont forget to pay bills")
print("To Do list remaining")
for task in to_do_list:
    print(f"-{task}")
Dont forget to pay bills To Do list remaining -Buy groceries -pay bills -Schedule meeting -Go for a run
Student grades
#organizing grades
grades=[85,90,45,75,65,80]
#appending grades
grades.append(77)
#calculating average
average_grades = sum(grades) / len(grades)
print(average_grades)
print(f"Average grades:{average_grades}")
#finding highest and lowest grades
highest_grades=max(grades)
lowest_grades=min(grades)
print(f"Highest grades:{highest_grades}")
print(f"Lowest grades:{lowest_grades}")
73.85714285714286 Average grades:73.85714285714286 Highest grades:90 Lowest grades:45
import math
average_grades=0
grades=[85,90,45,75,65,80]
average_grades=sum(grades) # note: this is the total, not the average
print(average_grades)
440
#manage inventory
inventory1=["apples","bananas","oranges","peaches"]
#adding a new item
inventory1.append("blueberries")
#removing an item
inventory1.remove("bananas")
#checking items in stock
item1="oranges"
if item1 in inventory1:
    print(f"{item1} are in stock")
else:
    print(f"{item1} are out of stock")
#printing inventory
print("Inventory list:")
for item1 in inventory1:
    print(f"-{item1}")
oranges are in stock Inventory list: -apples -oranges -peaches -blueberries
A function is a block of code that can be reused, improving readability.
## syntax
def function_name1(parameters1):
    """ DocString : what this function does"""
    #function body
    return #expression
## functions example
def even_or_odd_fun(num1):
    ''' This function finds whether num is even or odd'''
    if num1%2==0:
        print(f"{num1}:the number is even")
    else:
        print(f"{num1}:The number is odd")
even_or_odd_fun(11)
11:The number is odd
## function with multiple parameters
def add1(a1,b1):
    c1=a1+b1
    return c1
##
result1=add1(11,33)
print(result1)
######### another way of writing
def add1(a1,b1):
    return a1+b1
##
result2=add1(13,53)
print(result2)
44 66
## default parameters
def greet1(name1):
    print(f"Hello {name1}")
##calling the fun
greet1("student")
## assigning a default parameter
def greet1(name1="Guest"):
    print(f"Hello {name1}")
##calling the fun
greet1()
Hello student Hello Guest
## variable length arguments
## positional and keyword arguments
###
## positional_arguments
def print_number1(*args):
    ''' Instead of defining multiple arguments,
    just denote them with "*" and an argument name.
    '''
    for numbers34 in args:
        print(numbers34)
print_number1(1,2,3,4,5,6,7,8,90,"Hello")
1 2 3 4 5 6 7 8 90 Hello
#keyword_arguments
def print_details1(**kwargs):
    '''All the arguments will be
    in the form of
    key-value pairs
    '''
    for key1,value1 in kwargs.items():
        print(f"{key1}:{value1}")
print_details1(name1="student",Age1=34,Place1="NTU")
name1:student Age1:34 Place1:NTU
## combination of both positional and keyword arguments
def print_details2(*args,**kwargs):
    '''combination of both
    positional and
    keyword arguments
    '''
    for val in args:
        print(f"Positional arguments :{val}")
    for key1,value1 in kwargs.items():
        print(f"key/Value pair arguments : {key1}:{value1}")
print_details2(1,2,3,name1="student",Age1=34,Place1="NTU")
Positional arguments :1 Positional arguments :2 Positional arguments :3 key/Value pair arguments : name1:student key/Value pair arguments : Age1:34 key/Value pair arguments : Place1:NTU
## returning multiple values
def multiply_1(a1,b1):
    return a1*b1,a1
multiply_1(234,498657)
(116685738, 234)
def addition1(a1,b1):
    return a1+b1
print(addition1(2,6))
## converting the above into a lambda
addition2= lambda a1,b1:a1+b1
type(addition2)
print(addition2(4,5))
8 9
def even3(num3):
    if num3%2==0:
        return True
print(f"this number is even : {even3(22)}")
##converting the above into a lambda func
even4=lambda num3:num3%2==0
even4(12)
this number is even : True
True
def addition3(x,y,z):
    return x+y+z
print(addition3(3,5,9))
###using lambda func
addition4=lambda x,y,z:x+y+z
addition4(12,13,5)
17
30
map() func with lambda func
The map() function applies a given function to all the items in a list or any other iterable. It is useful for transforming data in a list.
##map
numbers5=[1,2,3,4,56,7]
#creating a function
def square23(numbers6):
    return numbers6**2
print(square23(5))
## now using lambda & map to apply the square to each item in the list
list(map(lambda x:x**2,numbers5))
25
[1, 4, 9, 16, 3136, 49]
def square25(x):
    return x*x
square25(10)
100
numbers56=[1,4,6,8,9,11,22,33]
#applying the function to every element of the list above
list(map(square25,numbers56))
[1, 16, 36, 64, 81, 121, 484, 1089]
## lambda func with map
numbers33=[1,3,5,7,9]
list(map(lambda x:x*x,numbers33))
[1, 9, 25, 49, 81]
## map func with multiple iterables
number12=[1,2,3]
numbers13=[4,5,6]
added_numbers1=list(map(lambda x,y:x+y,number12,numbers13))
print(added_numbers1)
[5, 7, 9]
## map() to convert a list of strings to integers
str_numbers=['1','2','3']
int_numbers=list(map(int, str_numbers))
print(int_numbers)
[1, 2, 3]
words12=['apple','banana','grapes']
upper_words=list(map(str.upper,words12))
print(upper_words)
['APPLE', 'BANANA', 'GRAPES']
def get_name(person1):
    return person1['name']
people1=[
    {'name':'jack','age':32},
    {'name':'jon','age':'34'}
]
list(map(get_name,people1))
['jack', 'jon']
The filter() function is used to filter items in a list or other iterable based on a condition.
def even5(num44):
    if num44%2==0:
        return True
print(even5(24))
# using filter func with the condition "num44%2==0" to keep even numbers only
list2=[1,2,3,4,5,6,7,8,9,10]
list(filter(even5,list2))
True
[2, 4, 6, 8, 10]
# filter with lambda
numbers21=[1,2,3,4,5,6,7,8,9,10]
greater_than_5=list(filter(lambda x:x>5, numbers21))
print(greater_than_5)
[6, 7, 8, 9, 10]
### filter with lambda func and multiple conditions
numbers21=[1,2,3,4,5,6,7,8,9,10]
even_greater_than_5=list(filter(lambda x:x>5 and x%2==0,numbers21))
print(even_greater_than_5)
[6, 8, 10]
### filter() to check if age is greater than 25
people1=[
    {'name':'jack','age':32},
    {'name':'jon','age':34},
    {'name':'jill','age':18},
    {'name':'zac','age':24}
]
def age_greater_than_25(person1):
    return person1['age']>25
list(filter(age_greater_than_25,people1))
[{'name': 'jack', 'age': 32}, {'name': 'jon', 'age': 34}]
Modules and packages in python
import math
from math import sqrt,pi
import numpy as np
from math import *
import math
math.sqrt(16)
from math import sqrt,pi
print(sqrt(99))
print(sqrt(25))
print(pi)
9.9498743710662 5.0 3.141592653589793
installing required libraries from a single txt file: F:\0007Python-files\1-Py-Basics> pip install -r .\requirement.txt
import numpy as np
np.array([1,2,3,4])
array([1, 2, 3, 4])
from math import *
## importing everything from the math module
create a folder named "package", then create a file named "__init__.py" inside it, then create the package's modules (a sketch of the layout follows)
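A minimal sketch of the folder layout these imports assume; only the module and function names come from the calls below, the function bodies are assumptions:
package/
    __init__.py
    maths1.py                  # defines e.g. def addition_module1(a, b): return a + b
    subpackages/
        __init__.py
        mult1.py               # defines e.g. def multify_module1(a, b): return a * b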
from package.maths1 import addition_module1
addition_module1(23,45)
68
from package import maths1
addition_module1(2,45)
47
from package.maths1 import addition_module1
print(addition_module1(12,7))
19
from package.maths1 import *
from package.subpackages.mult1 import multify_module1
print(maths1.addition_module1(12,7))
#print(maths1.substract_module1(3,1))
print(multify_module1(2,9))
19 18
Standard Library Overview
import array
arr1=array.array('i',[1,2,3,4,5])
print(arr1)
array('i', [1, 2, 3, 4, 5])
## random
import random
print(random.randint(1,10)) #random integer between 1 and 10
print(random.choice(['apple','cherry','lemon']))
3 cherry
## file and directory access
import os
print(os.getcwd()) #get current working directory
#os.mkdir('test_dir')
f:\0007Python-files\1-Py-Basics
## high-level operations on files and collections of files
import shutil
shutil.copyfile('101_Python_Basics.ipynb','test-read-me.txt')
'test-read-me.txt'
#data serialization
import json
data1={'name':'Student1','age':32}
json_str1=json.dumps(data1)
print(json_str1)
print(type(json_str1))
###
parsed_data1=json.loads(json_str1)
print(parsed_data1)
print(type(parsed_data1))
{"name": "Student1", "age": 32} <class 'str'> {'name': 'Student1', 'age': 32} <class 'dict'>
### CSV
import csv
with open('example.csv',mode='w',newline='') as file:
    writer1=csv.writer(file)
    writer1.writerow(['name','age'])
    writer1.writerow(['krish',32])
with open('example.csv',mode='r') as file:
    reader1=csv.reader(file)
    for row in reader1:
        print(row)
#
['name', 'age'] ['krish', '32']
##date time
from datetime import datetime,timedelta
now1=datetime.now()
print(now1)
yesterday1=now1-timedelta(days=1)
print(yesterday1)
2025-07-14 06:58:35.700002 2025-07-13 06:58:35.700002
## time
import time
print(time.time())
time.sleep(2)
print(time.time())
1752472715.7190876 1752472717.7197917
## regular expression
import re
pattern1=r'\d+'
text1='There are 123 apples'
match1=re.search(pattern1,text1)
print(match1.group())
123
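re.search returns only the first match; re.findall returns every match as a list. A minimal sketch:
import re
print(re.findall(r'\d+', 'There are 123 apples and 45 oranges')) # ['123', '45']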
Reading and writing files, both text and binary.
### read a whole file
with open('example.txt','r') as file:
    content1=file.read()
    print(content1)
New text is written and erasing entire file this is a new line
## read the file line by line
with open('example.txt','r') as file:
    for line in file:
        #print(line)
        print(line.strip()) ## strip() removes new line characters
New text is written and erasing entire file this is a new line
### writing a file (overwriting)
with open('example.txt','w') as file: #'w' is write mode, replacing existing contents
    file.write('ending the comments\n')
    file.write('Entire file is overwritten by replacing all the contents')
### Appending mode = "a"
with open('example.txt','a') as file:
    file.write(" \n appending to existing contents # creating environment #conda create -p venv python==3.12\n ")
## writing multiple lines = "a" + "writelines"
lines1=['First line \n','second line \n','third line \n']
with open('example.txt','a') as file:
    file.writelines(lines1)
### binary files
#writing to a binary file = 'wb'
data1=b'\x01\x01\x02\x03\x04'
with open('test1.bin','wb') as file:
    file.write(data1)
#reading from a binary file = "rb"
with open('test1.bin','rb') as file:
    content2=file.read()
    print(content2)
b'\x01\x01\x02\x03\x04'
### read the content from the source file and write it to a destination file
with open('101_Python_Basics.ipynb','r') as source_file1:
    content3=source_file1.read()
with open('example.txt','a') as destination_file1:
    destination_file1.write(content3)
#### writing and then reading the file
with open('example.txt','w+') as file:
    file.write("New text is written and erasing entire file\n")
    file.write("this is a new line\n")
    '''
    #### MOVE the file cursor to the beginning,
    otherwise read() will start reading from the end of the lines written above
    '''
    file.seek(0) ## move to the zero index
    ## read the content of the file
    content4=file.read()
    print(content4)
New text is written and erasing entire file this is a new line
## os.listdir
items_dir=os.listdir('.')
print(items_dir)
['101_Power_BI.txt', '101_Python_Basics.ipynb', '101_SQL_Basics.ipynb', 'All_Regions_RAW.xlsx', 'data.csv', 'df_excel1', 'example.csv', 'example.txt', 'example1.txt', 'flight_business.csv', 'flight_cleaned_v_01.csv', 'flight_Clean_Dataset.csv', 'flight_economy.csv', 'flight_price.xlsx', 'googleplaystore.csv', 'google_cleaned_v_01.csv', 'package', 'Python-Basics-101-markdown.py', 'requirement.txt', 'test-read-me.txt', 'test1.bin', 'winequality-red.csv']
### joining paths
dir_name1="folder"
file_name1="file.txt"
relative_path1=os.path.join(dir_name1,file_name1)
print(relative_path1)
full_path1=os.path.join(os.getcwd(),dir_name1,file_name1)
print(full_path1)
folder\file.txt f:\0007Python-files\1-Py-Basics\folder\file.txt
### checking whether a folder or path exists
path2='test1.bin'
if os.path.exists(path2):
    print(f"the path '{path2}' exists")
else:
    print(f"the path {path2} doesn't exist")
the path 'test1.bin' exists
### checking if the path is a file or directory
import os
path5='example.txt'
if os.path.isfile(path5):
    print(f"the path '{path5}' is a file")
elif os.path.isdir(path5):
    print(f"the path '{path5}' is a dir")
else:
    print(f"the path '{path5}' is neither a file nor a directory")
##
the path 'example.txt' is a file
### getting absolute path
relative_path2="example.csv"
absolute_path2=os.path.abspath(relative_path2)
print(absolute_path2)
f:\0007Python-files\1-Py-Basics\example.csv
Exceptions are events that disturb the normal flow of the program;
they occur when an error is encountered during program execution.
ZeroDivisionError
FileNotFoundError
ValueError = invalid value
TypeError = invalid type
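Of these, TypeError is the only one not caught in the examples that follow; a minimal sketch, reusing the string-plus-int error from the common errors section above:
try:
    result="hello" + 5 # mixing str and int raises TypeError
except TypeError as ex:
    print(ex) # can only concatenate str (not "int") to str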
### Try, except block
try:
    a88=b88
except:
    print("the variable is not assigned")
the variable is not assigned
## a88=b88 ### error is thrown
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[567], line 1 ----> 1 a88=b88 ### error is thrown NameError: name 'b88' is not defined
try:
    a88=b88
except:
    print("the variable is not assigned")
the variable is not assigned
try:
    a88=b88
except NameError as ex: ### catching the specific exception type
    print(ex)
name 'b88' is not defined
### throwing error ZeroDivisionError: division by zero
# result3=1/0
try:
    result_33=1/0
except ZeroDivisionError as ex:
    print(ex)
    print("Enter a denominator greater than Zero")
division by zero Enter a denominator greater than Zero
try:
    result_32=1/0
    a89=c89
except ZeroDivisionError as ex:
    print(ex)
    print("Enter a denominator greater than Zero")
division by zero Enter a denominator greater than Zero
### all other exceptions are handled by the base class "Exception"
try:
    result_31=1/0
    a_89=b_89
except ZeroDivisionError as ex:
    print(ex)
    print("Enter a denominator greater than Zero")
except Exception as ex1: ######## this generic exception needs to be written last
    print(ex1)
    print("main exception is caught")
division by zero Enter a denominator greater than Zero
try:
    num_90=int(input("enter a number"))
    result=10/num_90
except ValueError:
    print("this is not a valid number")
except ZeroDivisionError:
    print("enter a denominator greater than Zero")
except Exception as ex:
    print(ex)
### try, except, else
try:
    num_91=int(input("enter a number"))
    result_91=10/num_91
except ValueError:
    print("this is not a valid number")
except ZeroDivisionError:
    print("enter a denominator greater than Zero")
except Exception as ex:
    print(ex)
else:
    print(f"the result is {result_91}") ###### when there is no exception, the result gets printed
the result is 0.18518518518518517
###### try, except, else, finally
try:
    num_93=int(input("enter a number"))
    result_93=10/num_93
except ValueError:
    print("this is not a valid number")
except ZeroDivisionError:
    print("enter a denominator greater than Zero")
except Exception as ex:
    print(ex)
else:
    print(f"the result is {result_93}")
finally:
    print("execution is completed") ### finally is always executed, irrespective of exceptions
the result is 0.18518518518518517 execution is completed
### file handling and exception handling
try:
    file_name2=open('example1.txt')
    content5=file_name2.read()
    a67=b67
    print(content5)
except FileNotFoundError:
    print("the file does not exist")
except Exception as ex:
    print(ex)
finally:
    if 'file_name2' in locals() and not file_name2.closed: ### closed is a property, not a method; 'and' guards the case where open() failed
        file_name2.close()
        print('file close')
name 'b67' is not defined file close
NumPy supports arrays and matrices, with vectorized operations.
import numpy as np
## create an array using numpy
## create a 1D array
arr_1=np.array([1,2,3,4,5,6])
print(arr_1)
print(type(arr_1))
print(arr_1.shape)
[1 2 3 4 5 6] <class 'numpy.ndarray'> (6,)
arr_2=np.array([1,2,3,4,5])
arr_2.reshape(1,5) ## 1 row and 5 columns
array([[1, 2, 3, 4, 5]])
## 2D array
arr_2=np.array([[1,2,3,4,5],[6,7,8,9,10]])
print(arr_2)
print(arr_2.shape)
[[ 1 2 3 4 5] [ 6 7 8 9 10]] (2, 5)
np.arange(0,10,2).reshape(5,1)
array([[0], [2], [4], [6], [8]])
np.ones([3,4]) ### 3 rows with 4 columns with 1 as each value
array([[1., 1., 1., 1.], [1., 1., 1., 1.], [1., 1., 1., 1.]])
### Identity matrix = diagonals are ones
np.eye(3)
array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]])
### attributes of numpy array
arr_2=np.array([[1,2,3,4,5],[6,7,8,9,10]])
print("array:\n",arr_2)
print("shape",arr_2.shape)
print("Number of dimensions:",arr_2.ndim)
print("size (number of elements):", arr_2.size)
print("Data type:", arr_2.dtype)
print("Item size(in bytes):", arr_2.itemsize)
array: [[ 1 2 3 4 5] [ 6 7 8 9 10]] shape (2, 5) Number of dimensions: 2 size (number of elements): 10 Data type: int64 Item size(in bytes): 8
### numpy vectorized operations
arr_3=np.array([1,2,3,4,5,6])
arr_4=np.array([10,20,30,40,50,60])
## element-wise addition
print("Addition",arr_3+arr_4)
print("subtraction",arr_3-arr_4)
Addition [11 22 33 44 55 66] subtraction [ -9 -18 -27 -36 -45 -54]
### universal functions
arr_5=np.array([1,2,3,4,5])
## square root
print(np.sqrt(arr_5))
print(np.exp(arr_5))
print(np.log(arr_5))
[1. 1.41421356 1.73205081 2. 2.23606798] [ 2.71828183 7.3890561 20.08553692 54.59815003 148.4131591 ] [0. 0.69314718 1.09861229 1.38629436 1.60943791]
#### array slicing and indexing
arry_6=np.array([[1,2,3,4,5],[6,7,8,9,10],[11,12,13,14,15]])
print('array\n',arry_6)
array [[ 1 2 3 4 5] [ 6 7 8 9 10] [11 12 13 14 15]]
## accessing a particular element in a row and column
print(arry_6[0][0])
print("#")
print(arry_6[1:])
print("#")
print(arry_6[0:2])
print("#")
print(arry_6[0:3,3:])
print("#")
print(arry_6[0:1,2:])
1 # [[ 6 7 8 9 10] [11 12 13 14 15]] # [[ 1 2 3 4 5] [ 6 7 8 9 10]] # [[ 4 5] [ 9 10] [14 15]] # [[3 4 5]]
arry_6[1:,2:]
array([[ 8, 9, 10], [13, 14, 15]])
### practical
## statistical concepts --- Normalization
###
data_arr1=np.array([1,2,3,4,5])
##calculate mean and Std Deviation
mean_data_arr1=np.mean(data_arr1)
print("mean is,",mean_data_arr1)
std_dev_data_arr1=np.std(data_arr1)
print("SD is,",std_dev_data_arr1)
variance_data_arr1=np.var(data_arr1)
print("variance is:", variance_data_arr1)
## normalization
normalize_data_arr1=(data_arr1 - mean_data_arr1) / std_dev_data_arr1
print("Normalization is ",normalize_data_arr1)
mean is, 3.0 SD is, 1.4142135623730951 variance is: 2.0 Normalization is [-1.41421356 -0.70710678 0. 0.70710678 1.41421356]
### logical operations
data_arry_7=np.array([[1,2,3,4,5],[6,7,8,9,10],[11,12,13,14,15]])
print(data_arry_7>5)##gives true or false
print(data_arry_7[data_arry_7>5]) #values
print(data_arry_7[(data_arry_7>=5) & (data_arry_7<= 10)])
[[False False False False False] [ True True True True True] [ True True True True True]] [ 6 7 8 9 10 11 12 13 14 15] [ 5 6 7 8 9 10]
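np.where builds on this boolean masking by choosing values element-wise; a minimal sketch using the same array:
import numpy as np
data_arry_7=np.array([[1,2,3,4,5],[6,7,8,9,10],[11,12,13,14,15]])
print(np.where(data_arry_7>5, data_arry_7, 0)) # keep values greater than 5, replace the rest with 0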
Pandas - DataFrame and Series, used for data analysis and data cleaning. Two primary data structures: Series = 1D array; DataFrame = 2D array & heterogeneous.
import pandas as pd
## series
data_pand1=[1,2,3,4,5]
series_pand1=pd.Series(data_pand1)
print('series \n',series_pand1)
print(type(series_pand1))
print(type(data_pand1))
series 0 1 1 2 2 3 3 4 4 5 dtype: int64 <class 'pandas.core.series.Series'> <class 'list'>
## create a series from a dictionary
data_pand2_dict={'a':1,'b':2,'c':3}
series_pand1=pd.Series(data_pand2_dict)
print(series_pand1)
print(type(series_pand1))
print(type(data_pand2_dict))
a 1 b 2 c 3 dtype: int64 <class 'pandas.core.series.Series'> <class 'dict'>
data_pand2=[10,20,30]
index_pand2=['a','b','c']
pd.Series(data_pand2,index_pand2)
a 10 b 20 c 30 dtype: int64
### a data frame has multiple rows and columns
### create a DataFrame from a dictionary of lists
data_frame1= {
'name':['jack','jhon','dave'],
'age':[25,34,40],
'city':['blr','uk','lon']
}
df_data_frame1=pd.DataFrame(data_frame1)
print(df_data_frame1)
print(type(df_data_frame1))
name age city 0 jack 25 blr 1 jhon 34 uk 2 dave 40 lon <class 'pandas.core.frame.DataFrame'>
import numpy as np
np.array(df_data_frame1)
array([['jack', 25, 'blr'], ['jhon', 34, 'uk'], ['dave', 40, 'lon']], dtype=object)
### create a DataFrame from a list of dictionaries
data_frame2= [
{'name': 'jack','age':25,'city':'blr'},
{'name': 'jon','age':55,'city':'kkr'},
{'name': 'dave','age':45,'city':'rr'},
{'name': 'mike','age':35,'city':'csk'},
]
df_data_frame2=pd.DataFrame(data_frame2)
print(df_data_frame2)
print(type(df_data_frame2))
name age city 0 jack 25 blr 1 jon 55 kkr 2 dave 45 rr 3 mike 35 csk <class 'pandas.core.frame.DataFrame'>
###reading csv
df_data_csv1=pd.read_csv('data.csv')
#print(df_data_csv1.head(5))
df_data_csv1.head(5)
Date | Category | Value | Product | Sales | Region | |
---|---|---|---|---|---|---|
0 | 2023-01-01 | A | 28.0 | Product1 | 754.0 | East |
1 | 2023-01-02 | B | 39.0 | Product3 | 110.0 | North |
2 | 2023-01-03 | C | 32.0 | Product2 | 398.0 | East |
3 | 2023-01-04 | B | 8.0 | Product1 | 522.0 | East |
4 | 2023-01-05 | B | 26.0 | Product3 | 869.0 | North |
data_frame1= {
'name':['jack','jhon','dave'],
'age':[25,34,40],
'city':['blr','uk','lon']
}
df_data_frame1=pd.DataFrame(data_frame1)
print(type(df_data_frame1))
print("###")
print(df_data_frame1['name'])
print("###")
print(type(df_data_frame1['name']))
<class 'pandas.core.frame.DataFrame'> ### 0 jack 1 jhon 2 dave Name: name, dtype: object ### <class 'pandas.core.series.Series'>
df_data_frame1.loc[0] ## row by label
name jack age 25 city blr Name: 0, dtype: object
df_data_frame1.iloc[1] ## row by integer position
name jhon age 34 city uk Name: 1, dtype: object
### accessing a specified element
print(df_data_frame1.at[1,'age']) ## just pick a single element from the table
print(df_data_frame1.iat[2,2]) ## row & column index
34 lon
### data manipulation with data frames
## add new column
df_data_frame1['salary']=[50000,60000,78000]
#print(df_data_frame1)
df_data_frame1
name | age | city | salary | |
---|---|---|---|---|
0 | jack | 25 | blr | 50000 |
1 | jhon | 34 | uk | 60000 |
2 | dave | 40 | lon | 78000 |
##remove a column
##KeyError: "['salary'] not found in axis"
# df_data_frame1.drop('salary')
### temporary dropping
print(df_data_frame1)
print("###\n")
print("after removing column\n 'dropping'\n")
print(df_data_frame1.drop('salary',axis=1)) ## setting axis to 1 drops a column
print("###\n")
print("# again printing 'df_data_frame1'\n the drop is 'temporary' ##")
print(df_data_frame1)
name age city salary 0 jack 25 blr 50000 1 jhon 34 uk 60000 2 dave 40 lon 78000 ### after removing column 'dropping' name age city 0 jack 25 blr 1 jhon 34 uk 2 dave 40 lon ### # again printing 'df_data_frame1' the drop is 'temporary' ## name age city salary 0 jack 25 blr 50000 1 jhon 34 uk 60000 2 dave 40 lon 78000
###permanent dropping by adding inplace = true
print("# Data ##\n")
data_frame1= {
'name':['jack','jhon','dave'],
'age':[25,34,40],
'city':['blr','uk','lon']
}
df_data_frame1=pd.DataFrame(data_frame1)
print("# Adding ##\n")
df_data_frame1['salary']=[50000,60000,78000]
print("###\n")
print(df_data_frame1)
print("# Dropping ##\n")
print("adding inplace=true\n")
df_data_frame1.drop('salary',axis=1, inplace=True) ## inplace = true
print("###\n")
print("# again printing 'df_data_frame1'\n after executing inplace=true\n")
print(df_data_frame1)
# Data ## # Adding ## ### name age city salary 0 jack 25 blr 50000 1 jhon 34 uk 60000 2 dave 40 lon 78000 # Dropping ## adding inplace=true ### # again printing 'df_data_frame1' after executing inplace=true name age city 0 jack 25 blr 1 jhon 34 uk 2 dave 40 lon
### increase each value in the age column by 1
df_data_frame1['age']=df_data_frame1['age']+1
df_data_frame1
name | age | city | |
---|---|---|---|
0 | jack | 26 | blr |
1 | jhon | 35 | uk |
2 | dave | 41 | lon |
##### row drop
print("# Data ##\n")
data_frame1= {
'name':['jack','jhon','dave'],
'age':[25,34,40],
'city':['blr','uk','lon']
}
df_data_frame1=pd.DataFrame(data_frame1)
###
print("###\n")
df_data_frame1.drop(0,inplace=True)
df_data_frame1
# Data ## ###
name | age | city | |
---|---|---|---|
1 | jhon | 34 | uk |
2 | dave | 40 | lon |
###reading csv
df_data_csv1=pd.read_csv('data.csv')
#print(df_data_csv1.head(5))
df_data_csv1.head(5)
###
print("Data types\n",df_data_csv1.dtypes)
###
print("###\n")
print("Statistical summary:\n",df_data_csv1.describe())
Data types Date object Category object Value float64 Product object Sales float64 Region object dtype: object ### Statistical summary: Value Sales count 47.000000 46.000000 mean 51.744681 557.130435 std 29.050532 274.598584 min 2.000000 108.000000 25% 27.500000 339.000000 50% 54.000000 591.500000 75% 70.000000 767.500000 max 99.000000 992.000000
## data manipulation
###reading csv
df_data_csv1=pd.read_csv('data.csv')
df_data_csv1.head(5)
Date | Category | Value | Product | Sales | Region | |
---|---|---|---|---|---|---|
0 | 2023-01-01 | A | 28.0 | Product1 | 754.0 | East |
1 | 2023-01-02 | B | 39.0 | Product3 | 110.0 | North |
2 | 2023-01-03 | C | 32.0 | Product2 | 398.0 | East |
3 | 2023-01-04 | B | 8.0 | Product1 | 522.0 | East |
4 | 2023-01-05 | B | 26.0 | Product3 | 869.0 | North |
df_data_csv1.describe()
Value | Sales | |
---|---|---|
count | 47.000000 | 46.000000 |
mean | 51.744681 | 557.130435 |
std | 29.050532 | 274.598584 |
min | 2.000000 | 108.000000 |
25% | 27.500000 | 339.000000 |
50% | 54.000000 | 591.500000 |
75% | 70.000000 | 767.500000 |
max | 99.000000 | 992.000000 |
df_data_csv1.dtypes
Date object Category object Value float64 Product object Sales float64 Region object dtype: object
#df_data_csv1.isnull() ### true means the element is empty
df_data_csv1.isnull().any()
Date False Category False Value True Product False Sales True Region False dtype: bool
df_data_csv1.isnull().sum()
'''### provides the total number of elements
that are empty in each column'''
'### provides the total number of elements \nthat are empty in each column'
### filling missing values with the mean of the column
### copying the result into a new column
df_data_csv1['Sales_fillNA']=df_data_csv1['Sales'].fillna(df_data_csv1['Sales'].mean())
print("new column added 'Sales_fillNA' ")
df_data_csv1
new column added 'Sales_fillNA'
Date | Category | Value | Product | Sales | Region | Sales_fillNA | |
---|---|---|---|---|---|---|---|
0 | 2023-01-01 | A | 28.0 | Product1 | 754.0 | East | 754.000000 |
1 | 2023-01-02 | B | 39.0 | Product3 | 110.0 | North | 110.000000 |
2 | 2023-01-03 | C | 32.0 | Product2 | 398.0 | East | 398.000000 |
3 | 2023-01-04 | B | 8.0 | Product1 | 522.0 | East | 522.000000 |
4 | 2023-01-05 | B | 26.0 | Product3 | 869.0 | North | 869.000000 |
5 | 2023-01-06 | B | 54.0 | Product3 | 192.0 | West | 192.000000 |
6 | 2023-01-07 | A | 16.0 | Product1 | 936.0 | East | 936.000000 |
7 | 2023-01-08 | C | 89.0 | Product1 | 488.0 | West | 488.000000 |
8 | 2023-01-09 | C | 37.0 | Product3 | 772.0 | West | 772.000000 |
9 | 2023-01-10 | A | 22.0 | Product2 | 834.0 | West | 834.000000 |
10 | 2023-01-11 | B | 7.0 | Product1 | 842.0 | North | 842.000000 |
11 | 2023-01-12 | B | 60.0 | Product2 | NaN | West | 557.130435 |
12 | 2023-01-13 | A | 70.0 | Product3 | 628.0 | South | 628.000000 |
13 | 2023-01-14 | A | 69.0 | Product1 | 423.0 | East | 423.000000 |
14 | 2023-01-15 | A | 47.0 | Product2 | 893.0 | West | 893.000000 |
15 | 2023-01-16 | C | NaN | Product1 | 895.0 | North | 895.000000 |
16 | 2023-01-17 | C | 93.0 | Product2 | 511.0 | South | 511.000000 |
17 | 2023-01-18 | C | NaN | Product1 | 108.0 | West | 108.000000 |
18 | 2023-01-19 | A | 31.0 | Product2 | 578.0 | West | 578.000000 |
19 | 2023-01-20 | A | 59.0 | Product1 | 736.0 | East | 736.000000 |
20 | 2023-01-21 | C | 82.0 | Product3 | 606.0 | South | 606.000000 |
21 | 2023-01-22 | C | 37.0 | Product2 | 992.0 | South | 992.000000 |
22 | 2023-01-23 | B | 62.0 | Product3 | 942.0 | North | 942.000000 |
23 | 2023-01-24 | C | 92.0 | Product2 | 342.0 | West | 342.000000 |
24 | 2023-01-25 | A | 24.0 | Product2 | 458.0 | East | 458.000000 |
25 | 2023-01-26 | C | 95.0 | Product1 | 584.0 | West | 584.000000 |
26 | 2023-01-27 | C | 71.0 | Product2 | 619.0 | North | 619.000000 |
27 | 2023-01-28 | C | 56.0 | Product2 | 224.0 | North | 224.000000 |
28 | 2023-01-29 | B | NaN | Product3 | 617.0 | North | 617.000000 |
29 | 2023-01-30 | C | 51.0 | Product2 | 737.0 | South | 737.000000 |
30 | 2023-01-31 | B | 50.0 | Product3 | 735.0 | West | 735.000000 |
31 | 2023-02-01 | A | 17.0 | Product2 | 189.0 | West | 189.000000 |
32 | 2023-02-02 | B | 63.0 | Product3 | 338.0 | South | 338.000000 |
33 | 2023-02-03 | C | 27.0 | Product3 | NaN | East | 557.130435 |
34 | 2023-02-04 | C | 70.0 | Product3 | 669.0 | West | 669.000000 |
35 | 2023-02-05 | B | 60.0 | Product2 | NaN | West | 557.130435 |
36 | 2023-02-06 | C | 36.0 | Product3 | 177.0 | East | 177.000000 |
37 | 2023-02-07 | C | 2.0 | Product1 | NaN | North | 557.130435 |
38 | 2023-02-08 | C | 94.0 | Product1 | 408.0 | South | 408.000000 |
39 | 2023-02-09 | A | 62.0 | Product1 | 155.0 | West | 155.000000 |
40 | 2023-02-10 | B | 15.0 | Product1 | 578.0 | East | 578.000000 |
41 | 2023-02-11 | C | 97.0 | Product1 | 256.0 | East | 256.000000 |
42 | 2023-02-12 | A | 93.0 | Product3 | 164.0 | West | 164.000000 |
43 | 2023-02-13 | A | 43.0 | Product3 | 949.0 | East | 949.000000 |
44 | 2023-02-14 | A | 96.0 | Product3 | 830.0 | East | 830.000000 |
45 | 2023-02-15 | B | 99.0 | Product2 | 599.0 | West | 599.000000 |
46 | 2023-02-16 | B | 6.0 | Product1 | 938.0 | South | 938.000000 |
47 | 2023-02-17 | B | 69.0 | Product3 | 143.0 | West | 143.000000 |
48 | 2023-02-18 | C | 65.0 | Product3 | 182.0 | North | 182.000000 |
49 | 2023-02-19 | C | 11.0 | Product3 | 708.0 | North | 708.000000 |
### renaming Columns
df_data_csv1=df_data_csv1.rename(columns={'Date':'Sale Date'})
df_data_csv1.head(5)
Sale Date | Category | Value | Product | Sales | Region | Sales_fillNA | |
---|---|---|---|---|---|---|---|
0 | 2023-01-01 | A | 28.0 | Product1 | 754.0 | East | 754.0 |
1 | 2023-01-02 | B | 39.0 | Product3 | 110.0 | North | 110.0 |
2 | 2023-01-03 | C | 32.0 | Product2 | 398.0 | East | 398.0 |
3 | 2023-01-04 | B | 8.0 | Product1 | 522.0 | East | 522.0 |
4 | 2023-01-05 | B | 26.0 | Product3 | 869.0 | North | 869.0 |
### change data type
### fillna is used first since there are empty values, which would make astype(int) throw an error
df_data_csv1['Value_newType']=df_data_csv1['Value'].fillna(df_data_csv1['Value'].mean()).astype(int)
print(df_data_csv1.dtypes)
print(df_data_csv1.head(5))
Sale Date object Category object Value float64 Product object Sales float64 Region object Sales_fillNA float64 Value_newType int64 dtype: object Sale Date Category Value Product Sales Region Sales_fillNA \ 0 2023-01-01 A 28.0 Product1 754.0 East 754.0 1 2023-01-02 B 39.0 Product3 110.0 North 110.0 2 2023-01-03 C 32.0 Product2 398.0 East 398.0 3 2023-01-04 B 8.0 Product1 522.0 East 522.0 4 2023-01-05 B 26.0 Product3 869.0 North 869.0 Value_newType 0 28 1 39 2 32 3 8 4 26
### using lambda expression
df_data_csv1['New_value']=df_data_csv1['Value'].apply(lambda x:x*2)
df_data_csv1.head(5)
Sale Date | Category | Value | Product | Sales | Region | Sales_fillNA | Value_newType | New_value | |
---|---|---|---|---|---|---|---|---|---|
0 | 2023-01-01 | A | 28.0 | Product1 | 754.0 | East | 754.0 | 28 | 56.0 |
1 | 2023-01-02 | B | 39.0 | Product3 | 110.0 | North | 110.0 | 39 | 78.0 |
2 | 2023-01-03 | C | 32.0 | Product2 | 398.0 | East | 398.0 | 32 | 64.0 |
3 | 2023-01-04 | B | 8.0 | Product1 | 522.0 | East | 522.0 | 8 | 16.0 |
4 | 2023-01-05 | B | 26.0 | Product3 | 869.0 | North | 869.0 | 26 | 52.0 |
grouped_mean1=df_data_csv1.groupby('Product')['Value'].mean()
print(grouped_mean1)
Product Product1 46.214286 Product2 52.800000 Product3 55.166667 Name: Value, dtype: float64
grouped_sum2=df_data_csv1.groupby(['Product','Region'])['Value'].sum()
print(grouped_sum2)
Product Region Product1 East 292.0 North 9.0 South 100.0 West 246.0 Product2 East 56.0 North 127.0 South 181.0 West 428.0 Product3 East 202.0 North 203.0 South 215.0 West 373.0 Name: Value, dtype: float64
grouped_mean2=df_data_csv1.groupby(['Product','Region'])['Value'].mean()
print(grouped_mean2)
Product Region Product1 East 41.714286 North 4.500000 South 50.000000 West 82.000000 Product2 East 28.000000 North 63.500000 South 60.333333 West 53.500000 Product3 East 50.500000 North 40.600000 South 71.666667 West 62.166667 Name: Value, dtype: float64
#### aggregate multiple functions
grouped_agg1=df_data_csv1.groupby('Region')['Value'].agg(['mean','sum','count'])
grouped_agg1
mean | sum | count | |
---|---|---|---|
Region | |||
East | 42.307692 | 550.0 | 13 |
North | 37.666667 | 339.0 | 9 |
South | 62.000000 | 496.0 | 8 |
West | 61.588235 | 1047.0 | 17 |
### merging and joining data frames
data_frame3=pd.DataFrame({'key':['a','b','c'], 'value1':[1,2,3]})
data_frame4=pd.DataFrame({'key':['a','b','d'], 'value1':[4,5,6]})
print(data_frame3)
print(data_frame4)
key value1 0 a 1 1 b 2 2 c 3 key value1 0 a 4 1 b 5 2 d 6
### merging DataFrames on key
pd.merge(data_frame3,data_frame4,on="key",how="inner")
key | value1_x | value1_y | |
---|---|---|---|
0 | a | 1 | 4 |
1 | b | 2 | 5 |
pd.merge(data_frame3,data_frame4,on="key",how="outer")
key | value1_x | value1_y | |
---|---|---|---|
0 | a | 1.0 | 4.0 |
1 | b | 2.0 | 5.0 |
2 | c | 3.0 | NaN |
3 | d | NaN | 6.0 |
pd.merge(data_frame3,data_frame4,on="key",how="left")
key | value1_x | value1_y | |
---|---|---|---|
0 | a | 1 | 4.0 |
1 | b | 2 | 5.0 |
2 | c | 3 | NaN |
pd.merge(data_frame3,data_frame4,on="key",how="right")
key | value1_x | value1_y | |
---|---|---|---|
0 | a | 1.0 | 4 |
1 | b | 2.0 | 5 |
2 | d | NaN | 6 |
import pandas as pd
from io import StringIO
data_jason1= '{"employee_name":"james","email":"james@gmail.com","job_profile":[{"Title1":"analyst","Title2":"Sr. Dev"}]}'
df_data_jason1=pd.read_json(StringIO(data_jason1))
df_data_jason1
employee_name | email | job_profile | |
---|---|---|---|
0 | james | james@gmail.com | {'Title1': 'analyst', 'Title2': 'Sr. Dev'} |
df_data_jason1.to_json()
'{"employee_name":{"0":"james"},"email":{"0":"james@gmail.com"},"job_profile":{"0":{"Title1":"analyst","Title2":"Sr. Dev"}}}'
df_data_jason1.to_json(orient='index')
'{"0":{"employee_name":"james","email":"james@gmail.com","job_profile":{"Title1":"analyst","Title2":"Sr. Dev"}}}'
df_data_jason1.to_json(orient='records')
'[{"employee_name":"james","email":"james@gmail.com","job_profile":{"Title1":"analyst","Title2":"Sr. Dev"}}]'
## reading a CSV from a website
pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data",header=None)
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 14.23 | 1.71 | 2.43 | 15.6 | 127 | 2.80 | 3.06 | 0.28 | 2.29 | 5.64 | 1.04 | 3.92 | 1065 |
1 | 1 | 13.20 | 1.78 | 2.14 | 11.2 | 100 | 2.65 | 2.76 | 0.26 | 1.28 | 4.38 | 1.05 | 3.40 | 1050 |
2 | 1 | 13.16 | 2.36 | 2.67 | 18.6 | 101 | 2.80 | 3.24 | 0.30 | 2.81 | 5.68 | 1.03 | 3.17 | 1185 |
3 | 1 | 14.37 | 1.95 | 2.50 | 16.8 | 113 | 3.85 | 3.49 | 0.24 | 2.18 | 7.80 | 0.86 | 3.45 | 1480 |
4 | 1 | 13.24 | 2.59 | 2.87 | 21.0 | 118 | 2.80 | 2.69 | 0.39 | 1.82 | 4.32 | 1.04 | 2.93 | 735 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
173 | 3 | 13.71 | 5.65 | 2.45 | 20.5 | 95 | 1.68 | 0.61 | 0.52 | 1.06 | 7.70 | 0.64 | 1.74 | 740 |
174 | 3 | 13.40 | 3.91 | 2.48 | 23.0 | 102 | 1.80 | 0.75 | 0.43 | 1.41 | 7.30 | 0.70 | 1.56 | 750 |
175 | 3 | 13.27 | 4.28 | 2.26 | 20.0 | 120 | 1.59 | 0.69 | 0.43 | 1.35 | 10.20 | 0.59 | 1.56 | 835 |
176 | 3 | 13.17 | 2.59 | 2.37 | 20.0 | 120 | 1.65 | 0.68 | 0.53 | 1.46 | 9.30 | 0.60 | 1.62 | 840 |
177 | 3 | 14.13 | 4.10 | 2.74 | 24.5 | 96 | 2.05 | 0.76 | 0.56 | 1.35 | 9.20 | 0.61 | 1.60 | 560 |
178 rows × 14 columns
### reading tables from web HTML
url1="https://www.fdic.gov/bank-failures/failed-bank-list"
df_url1=pd.read_html(url1)
df_url1[0]
Bank Name | City | State | Cert | Acquiring Institution | Closing Date | Fund Sort ascending | |
---|---|---|---|---|---|---|---|
0 | The Santa Anna National Bank | Santa Anna | Texas | 5520 | Coleman County State Bank | June 27, 2025 | 10549 |
1 | Pulaski Savings Bank | Chicago | Illinois | 28611 | Millennium Bank | January 17, 2025 | 10548 |
2 | The First National Bank of Lindsay | Lindsay | Oklahoma | 4134 | First Bank & Trust Co., Duncan, OK | October 18, 2024 | 10547 |
3 | Republic First Bank dba Republic Bank | Philadelphia | Pennsylvania | 27332 | Fulton Bank, National Association | April 26, 2024 | 10546 |
4 | Citizens Bank | Sac City | Iowa | 8758 | Iowa Trust & Savings Bank | November 3, 2023 | 10545 |
5 | Heartland Tri-State Bank | Elkhart | Kansas | 25851 | Dream First Bank, N.A. | July 28, 2023 | 10544 |
6 | First Republic Bank | San Francisco | California | 59017 | JPMorgan Chase Bank, N.A. | May 1, 2023 | 10543 |
7 | Signature Bank | New York | New York | 57053 | Flagstar Bank, N.A. | March 12, 2023 | 10540 |
8 | Silicon Valley Bank | Santa Clara | California | 24735 | First Citizens Bank & Trust Company | March 10, 2023 | 10539 |
9 | Almena State Bank | Almena | Kansas | 15426 | Equity Bank | October 23, 2020 | 10538 |
url2="https://passmovegrin.spawtz.com/Leagues/IndoorCricket/Scoresheet?FixtureId=2265"
df_url2=pd.read_html(url2)
df_url2
[output truncated: a list of scraped DataFrames - a skins/points summary (Pass Move Grin A24: 111 (2 skins), 5 points; Team TT Rockers (2): 100 (2 skins), 2 points), two ball-by-ball batting scoresheets (18 rows × 30 and 18 rows × 32 columns), a player-of-the-match note (Chris Cooke, Pass Move Grin A24), per-team batting/bowling summaries (Name, RS, SR, OB, RC, Wkts, Econ, C), and two share-widget fragments]
url3="https://www.gouti1454.com/p/iso-27001.html"
df_url3=pd.read_html(url3)
df_url3[0]
0 | 1 | 2 | 3 | 4 | 5 | 6 | |
---|---|---|---|---|---|---|---|
0 | ISO/IEC 27002 control identifier | Control name | Control type | Information security properties | Cybersecurity concepts | Operational capabilities | Security domains |
1 | 5.1 | Policies for information security | #Preventive | #Confidentiality #Integrity #Availability | #Identify | #Governance | #Governance_ and_Ecosys- tem #Resil- ience |
2 | 5.2 | Information security roles and responsi- bili... | #Preventive | #Confidentiality #Integrity #Availability | #Identify | #Governance | #Govern- ance_and_ Ecosystem#Protection#Resil... |
3 | 5.3 | Segregation of duties | #Preventive | #Confidentiality #Integrity #Availability | #Protect | #Governance #Identity_and_access_man-agement | #Governance_ and_Ecosys- tem |
4 | 5.4 | Management responsibili- ties | #Preventive | #Confidentiality #Integrity #Availability | #Identify | #Governance | #Governance_ and_Ecosys- tem |
... | ... | ... | ... | ... | ... | ... | ... |
89 | 8.30 | Outsourced development | #Preventive #Detective | #Confidentiality #Integrity #Availability | #Identify #Protect #Detect | #System_and_ network_secu- rity #Applica- tio... | #Governance_ and_Ecosys- tem #Protec- tion |
90 | 8.31 | Separation of develop- ment, test and product... | #Preventive | #Confidentiality #Integrity #Availability | #Protect | #Applica-tion_security#System_and_network_secu... | #Protection |
91 | 8.32 | Change man-agement | #Preventive | #Confidentiality #Integrity #Availability | #Protect | #Applica-tion_security#System_and_network_secu... | #Protection |
92 | 8.33 | Test informa- tion | #Preventive | #Confidentiality #Integrity | #Protect | #Information_protection | #Protection |
93 | 8.34 | Protection of informa- tion systems during au... | #Preventive | #Confidentiality #Integrity #Availability | #Protect | #System_and_ network_secu- rity #Informa- tio... | #Governance_ and_Ecosys- tem #Protec- tion |
94 rows × 7 columns
#### reading the mobile country code table, selecting it with the match keyword
url4="https://en.wikipedia.org/wiki/Mobile_country_code"
df_url4=pd.read_html(url4,match="Country",header=0)[0]
df_url4
Mobile country code | Country | ISO 3166 | Mobile network codes | National MNC authority | Remarks | |
---|---|---|---|---|---|---|
0 | 289 | A Abkhazia | GE-AB | List of mobile network codes in Abkhazia | NaN | MCC is not listed by ITU |
1 | 412 | Afghanistan | AF | List of mobile network codes in Afghanistan | NaN | NaN |
2 | 276 | Albania | AL | List of mobile network codes in Albania | NaN | NaN |
3 | 603 | Algeria | DZ | List of mobile network codes in Algeria | NaN | NaN |
4 | 544 | American Samoa (United States of America) | AS | List of mobile network codes in American Samoa | NaN | NaN |
... | ... | ... | ... | ... | ... | ... |
247 | 452 | Vietnam | VN | List of mobile network codes in the Vietnam | NaN | NaN |
248 | 543 | W Wallis and Futuna | WF | List of mobile network codes in Wallis and Futuna | NaN | NaN |
249 | 421 | Y Yemen | YE | List of mobile network codes in the Yemen | NaN | NaN |
250 | 645 | Z Zambia | ZM | List of mobile network codes in Zambia | NaN | NaN |
251 | 648 | Zimbabwe | ZW | List of mobile network codes in Zimbabwe | NaN | NaN |
252 rows × 6 columns
from usp.tree import sitemap_tree_for_homepage
url6="https://www.gouti1454.com//sitemap-pages.xml"
SiteMap_Page1=sitemap_tree_for_homepage(url6)
for page1 in SiteMap_Page1.all_pages():
    print(page1)
import advertools as adv
SiteMap_Page2=adv.sitemap_to_df(url6)
SiteMap_Page2.head(5)
SiteMap_Page3=adv.url_to_df(SiteMap_Page2['loc'])
SiteMap_Page3.head(5)
### read excel
## by default reads the first sheet.
df_excel1=pd.read_excel('All_Regions_RAW.xlsx', sheet_name='PPM Details')
df_excel1.head(5)
S.No | Region | Site | Building | Floor | Ward | Room | Importance | Predicted Time | Annual Cost | ... | Occurrence Release Date | Occurrence Expiry Date | Occurrence Notes | Resource | Resource Type | From Time | To Time | Hours Recorded | Cost | Hourly Rate | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Shropshire MPFT | The Redwoods Centre | Wenlock Building | First Floor | Nill | SHPRREB1011FEOPR1 Estates - Plant Room | Important | 0:00 | 0.0 | ... | 2023-09-04 | 2023-09-11 | Nill | Andy Davies | Maintenance Assistant | 2023-09-08 09:18:00 | 2023-09-08 09:43:00 | 0.416666 | 7.9542 | 19.09 |
1 | 2 | Shropshire Community Health NHS Trust - Sites | Whitchurch Hospital | Whitchurch Hospital Main Block | Ground Floor | Nill | Nill | Important | 0:00 | 0.0 | ... | 2023-09-04 | 2023-09-11 | Nill | Sam Lewis | Plumber/Fitter | 2023-09-05 15:07:00 | 2023-09-05 16:42:00 | 1.583333 | 43.4625 | 27.45 |
2 | 3 | Shropshire Community Health NHS Trust - Sites | Bridgnorth Hospital | Bridgnorth Hospital Main Block | Ground Floor | Nill | Nill | Important | 0:00 | 0.0 | ... | 2023-09-04 | 2023-09-11 | Nill | Lance Spreadbury | Maintenance Assistant | 2023-09-08 14:15:00 | 2023-09-08 14:45:00 | 0.500000 | 8.3150 | 16.63 |
3 | 4 | Shropshire MPFT | The Redwoods Centre | Wenlock Building | First Floor | Nill | SHPRREB1011FEOPR1 Estates - Plant Room | Important | 0:00 | 0.0 | ... | 2023-09-11 | 2023-09-18 | Nill | Ian Griffiths | Plumber/Fitter | 2023-09-14 09:19:00 | 2023-09-14 10:39:00 | 1.333333 | 36.6000 | 27.45 |
4 | 5 | Shropshire Community Health NHS Trust - Sites | Whitchurch Hospital | Whitchurch Hospital Main Block | Ground Floor | Nill | Nill | Important | 0:00 | 0.0 | ... | 2023-09-11 | 2023-09-18 | Nill | Sam Lewis | Plumber/Fitter | 2023-09-15 19:07:00 | 2023-09-15 19:29:00 | 0.366666 | 10.0650 | 27.45 |
5 rows × 37 columns
#### converting the Excel data into a pickle file
df_excel1.to_pickle('df_excel1')
pd.read_pickle('df_excel1')
S.No | Region | Site | Building | Floor | Ward | Room | Importance | Predicted Time | Annual Cost | ... | Occurrence Release Date | Occurrence Expiry Date | Occurrence Notes | Resource | Resource Type | From Time | To Time | Hours Recorded | Cost | Hourly Rate | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Shropshire MPFT | The Redwoods Centre | Wenlock Building | First Floor | Nill | SHPRREB1011FEOPR1 Estates - Plant Room | Important | 0:00 | 0.0 | ... | 2023-09-04 | 2023-09-11 | Nill | Andy Davies | Maintenance Assistant | 2023-09-08 09:18:00 | 2023-09-08 09:43:00 | 0.416666 | 7.9542 | 19.09 |
1 | 2 | Shropshire Community Health NHS Trust - Sites | Whitchurch Hospital | Whitchurch Hospital Main Block | Ground Floor | Nill | Nill | Important | 0:00 | 0.0 | ... | 2023-09-04 | 2023-09-11 | Nill | Sam Lewis | Plumber/Fitter | 2023-09-05 15:07:00 | 2023-09-05 16:42:00 | 1.583333 | 43.4625 | 27.45 |
2 | 3 | Shropshire Community Health NHS Trust - Sites | Bridgnorth Hospital | Bridgnorth Hospital Main Block | Ground Floor | Nill | Nill | Important | 0:00 | 0.0 | ... | 2023-09-04 | 2023-09-11 | Nill | Lance Spreadbury | Maintenance Assistant | 2023-09-08 14:15:00 | 2023-09-08 14:45:00 | 0.500000 | 8.3150 | 16.63 |
3 | 4 | Shropshire MPFT | The Redwoods Centre | Wenlock Building | First Floor | Nill | SHPRREB1011FEOPR1 Estates - Plant Room | Important | 0:00 | 0.0 | ... | 2023-09-11 | 2023-09-18 | Nill | Ian Griffiths | Plumber/Fitter | 2023-09-14 09:19:00 | 2023-09-14 10:39:00 | 1.333333 | 36.6000 | 27.45 |
4 | 5 | Shropshire Community Health NHS Trust - Sites | Whitchurch Hospital | Whitchurch Hospital Main Block | Ground Floor | Nill | Nill | Important | 0:00 | 0.0 | ... | 2023-09-11 | 2023-09-18 | Nill | Sam Lewis | Plumber/Fitter | 2023-09-15 19:07:00 | 2023-09-15 19:29:00 | 0.366666 | 10.0650 | 27.45 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
5534 | 5535 | North Staffordshire MPFT | Cheadle Hospital | Cheadle Hospital | Lower Ground Floor | Nill | Nill | Important | 0:00 | 0.0 | ... | 2023-09-25 | 2023-10-02 | \nSep 27 2023 8:00PM Paul White Occurrence cl... | Paul White | Maintenance Assistant | 2023-09-27 15:08:00 | 2023-09-27 18:00:00 | 2.866666 | 47.6727 | 16.63 |
5535 | 5536 | North Staffordshire MPFT | Cheadle Hospital | Cheadle Hospital | Lower Ground Floor | Nill | Nill | Important | 0:00 | 0.0 | ... | 2023-09-25 | 2023-10-02 | \nSep 27 2023 8:00PM Paul White Occurrence cl... | Paul White | Maintenance Assistant | 2023-09-25 00:00:00 | 2023-09-25 00:00:00 | 0.000000 | 0.0000 | 16.63 |
5536 | 5537 | South Staffordshire MPFT | St Georges Hospital, Stafford | St Chads - Main Corridor | Ground Floor | Nill | STFRRE1140CGF014 Room – Treatment (ECT) | Important | 0:00 | 0.0 | ... | 2023-07-10 | 2023-08-07 | 02 Aug 23 13:03:56 - Colin Winfindale - Job c... | Colin Winfindale | Maintenance Assistant | 2023-08-02 12:45:00 | 2023-08-02 13:03:00 | 0.300000 | 5.4420 | 18.14 |
5537 | 5538 | South Staffordshire MPFT | St Georges Hospital, Stafford | St Chads - Main Corridor | Ground Floor | Nill | STFRRE1140CGF014 Room – Treatment (ECT) | Important | 0:00 | 0.0 | ... | 2023-08-09 | 2023-09-06 | 06 Sep 23 09:51:50 - Colin Winfindale - Job c... | Colin Winfindale | Maintenance Assistant | 2023-09-06 09:26:00 | 2023-09-06 09:51:00 | 0.416666 | 7.5583 | 18.14 |
5538 | 5539 | South Staffordshire MPFT | St Georges Hospital, Stafford | St Chads - Main Corridor | Ground Floor | Nill | STFRRE1140CGF014 Room – Treatment (ECT) | Important | 0:00 | 0.0 | ... | 2023-09-08 | 2023-10-06 | Nill | Colin Winfindale | Maintenance Assistant | 2023-09-08 00:00:00 | 2023-09-08 00:00:00 | 0.000000 | 0.0000 | 18.14 |
5539 rows × 37 columns
import matplotlib.pyplot as plt
x_axis1=[1,2,3,4,5]
y_axis1=[1,4,9,16,25]
## create a line plot
plt.plot(x_axis1,y_axis1)
plt.xlabel('x axis')
plt.ylabel('y axis')
plt.title("Line plot")
plt.show()
x_axis1=[1,2,3,4,5]
y_axis1=[1,4,9,16,25]
## customised line plot
plt.plot(x_axis1,y_axis1,color='red',linestyle='--',marker='o',linewidth=3,markersize=9)
plt.grid(True)
### Multiple plots
x_axis1=[1,2,3,4,5]
y1_axis1=[1,4,9,16,25]
y2_axis2=[1,2,4,5,6]
plt.figure(figsize=(9,5)) ## figure size is 9,5
plt.subplot(2,2,1) ### two rows, two columns, and the position where this graph is placed
plt.plot(x_axis1,y1_axis1,color='green')
plt.title('plot 1')
####
plt.subplot(2,2,2) ### two rows, two columns, and the position where this graph is placed
plt.plot(y1_axis1,x_axis1,color='red')
plt.title('plot 2')
###
plt.subplot(2,2,3) ### two rows, two columns, and the position where this graph is placed
plt.plot(y1_axis1,x_axis1,color='blue')
plt.title('plot 3')
###
plt.subplot(2,2,4) ### two rows, two columns, and the position where this graph is placed
plt.plot(y2_axis2,x_axis1,color='orange')
plt.title('plot 4')
Text(0.5, 1.0, 'plot 4')
### BAR plot
categories_1=['a','b','c','d','e']
values_1=[5,7,9,3,6]
## Bar plot
plt.bar(categories_1,values_1,color="red")
<BarContainer object of 5 artists>
his_data_1=[1,2,2,3,3,3,4,4,4,4,5,5,5,5,5]
## histogram
plt.hist(his_data_1,bins=5,edgecolor='black')
(array([1., 2., 3., 4., 5.]), array([1. , 1.8, 2.6, 3.4, 4.2, 5. ]), <BarContainer object of 5 artists>)
### scatter plot
x_scatt1=[1,2,3,4,5]
y_scatt1=[2,3,4,5,6]
plt.scatter(x_scatt1,y_scatt1,color='red',marker='*')
<matplotlib.collections.PathCollection at 0x13810eda850>
### pie chart
lables_pie=['a','b','c','d']
sizes_pie=[30,20,40,10]
colors_pie=['gold','red','pink','green']
explode_pie=[0.2,0,0,0]
plt.pie(sizes_pie,explode=explode_pie,labels=lables_pie,colors=colors_pie,autopct='%1.1f%%',shadow=True)
([<matplotlib.patches.Wedge at 0x13810b6f390>, <matplotlib.patches.Wedge at 0x13810b6f890>, <matplotlib.patches.Wedge at 0x13810b6fd90>, <matplotlib.patches.Wedge at 0x13810be82d0>], [Text(0.7641207377093499, 1.0517221582730485, 'a'), Text(-0.8899187390319623, 0.6465637152823859, 'b'), Text(-0.3399188151951704, -1.046162128485022, 'c'), Text(1.0461621838648125, -0.339918644753721, 'd')], [Text(0.47022814628267684, 0.6472136358603374, '30.0%'), Text(-0.4854102212901612, 0.3526711174267559, '20.0%'), Text(-0.1854102628337293, -0.5706338882645574, '40.0%'), Text(0.5706339184717159, -0.185410169865666, '10.0%')])
### sales data.csv
import pandas as pd
sales_data1_df=pd.read_csv('data.csv')
sales_data1_df.head(5)
Date | Category | Value | Product | Sales | Region | |
---|---|---|---|---|---|---|
0 | 2023-01-01 | A | 28.0 | Product1 | 754.0 | East |
1 | 2023-01-02 | B | 39.0 | Product3 | 110.0 | North |
2 | 2023-01-03 | C | 32.0 | Product2 | 398.0 | East |
3 | 2023-01-04 | B | 8.0 | Product1 | 522.0 | East |
4 | 2023-01-05 | B | 26.0 | Product3 | 869.0 | North |
sales_data1_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 50 entries, 0 to 49 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 50 non-null object 1 Category 50 non-null object 2 Value 47 non-null float64 3 Product 50 non-null object 4 Sales 46 non-null float64 5 Region 50 non-null object dtypes: float64(2), object(4) memory usage: 2.5+ KB
## plot totals by category (sum of 'Value' grouped by 'Category')
tot_sales_by_prod=sales_data1_df.groupby('Category')['Value'].sum()
print(tot_sales_by_prod)
Category A 677.0 B 618.0 C 1137.0 Name: Value, dtype: float64
tot_sales_by_prod.plot(kind='bar')
<Axes: xlabel='Category'>
## plot sales trend over Date
sales_trend=sales_data1_df.groupby('Date')['Sales'].sum().reset_index()
print(sales_trend.head(5))
plt.plot(sales_trend['Date'],sales_trend["Sales"])
2025-07-14 06:59:35,569 | INFO | category.py:224 | update | Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. 2025-07-14 06:59:35,573 | INFO | category.py:224 | update | Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Date Sales 0 2023-01-01 754.0 1 2023-01-02 110.0 2 2023-01-03 398.0 3 2023-01-04 522.0 4 2023-01-05 869.0
[<matplotlib.lines.Line2D at 0x13810dc8f50>]
Seaborn: built on matplotlib, for complex visualizations
import seaborn as sns
## tips data set is present in the library
tips1=sns.load_dataset('tips')
tips1
total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
... | ... | ... | ... | ... | ... | ... | ... |
239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 |
240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 |
241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 |
243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
244 rows × 7 columns
import matplotlib.pyplot as plt
sns.scatterplot(x='total_bill',y='tip',data=tips1)
plt.title('Scatter plot Total bill Vs Tip')
plt.show()
### line plot
sns.lineplot(x='size',y='total_bill',data=tips1)
plt.title('Line plot Total bill Vs Size')
plt.show()
## categorical Plots
## bar plot
sns.barplot(x='day',y='total_bill',data=tips1)
plt.title('Bar plot Total bill Vs Day')
plt.show()
## box plot
sns.boxplot(x='day',y='total_bill',data=tips1)
plt.title('Box plot Total bill Vs Day')
plt.show()
### violin
sns.violinplot(x='day',y='total_bill',data=tips1)
plt.title('Violin plot Total bill Vs Day')
plt.show()
### histogram
sns.histplot(tips1['total_bill'],bins=10,kde=True)
plt.show()
###KDE plot
sns.kdeplot(tips1['total_bill'],fill=True)
<Axes: xlabel='total_bill', ylabel='Density'>
## pair plot only with numerical variables
sns.pairplot(tips1)
<seaborn.axisgrid.PairGrid at 0x13810ab7110>
### heat map
corr_data1=tips1[['total_bill','tip','size']].corr()
sns.heatmap(corr_data1,annot=True,cmap='coolwarm')
<Axes: >
sales_data1_df=pd.read_csv('data.csv')
sales_data1_df.head(5)
Date | Category | Value | Product | Sales | Region | |
---|---|---|---|---|---|---|
0 | 2023-01-01 | A | 28.0 | Product1 | 754.0 | East |
1 | 2023-01-02 | B | 39.0 | Product3 | 110.0 | North |
2 | 2023-01-03 | C | 32.0 | Product2 | 398.0 | East |
3 | 2023-01-04 | B | 8.0 | Product1 | 522.0 | East |
4 | 2023-01-05 | B | 26.0 | Product3 | 869.0 | North |
### plot total sales by product
plt.figure(figsize=(10,6))
sns.barplot(x='Category',y='Sales',data=sales_data1_df,estimator=sum)
<Axes: xlabel='Category', ylabel='Sales'>
''' Statistics is the science of collecting, organizing, and analyzing data = "facts or pieces of information" which can be "measured, collected, analyzed" to support decision making
Data Analyst -> reports -> visualization -> meaningful decisions Data Scientist -> model -> prediction
Applications:
Types of Statistics:
Descriptive Statistics = summarizing & organizing data to make it understandable
Inferential statistics = methods for making predictions or inferences - drawing conclusions about the population from sample data
Population and sample data Population: entire set Sample data: subset of the population
Types of Sampling Techniques
probability sampling a. simple random sampling b. systematic sampling c. stratified sampling d. cluster sampling e. multistage sampling
non-probability sampling a. convenience sampling b. judgmental sampling c. snowball sampling d. quota sampling
DATA
Quantitative: Discrete = whole numbers & +ve numbers e.g: no. of children in the family. Continuous = any numerical value e.g: weight, height, temperature, speed.
Qualitative data: nominal data = no ranking e.g Gender, Blood group, Pincode; ordinal data = ranks can be assigned e.g: customer feedback = good, bad, better
Scales of measurement of data
nominal scale - categories don't have any intrinsic order - qualitative/categorical data Characteristics: - data categorized based on names, labels or qualities e.g gender male or female - categories are mutually exclusive - no logical order or ranking
ordinal scale - classifies data into categories that can be ranked and ordered Characteristics: - ranked in a specific order - intervals b/w ranks are not necessarily equal
interval scale - not only categorizes and ranks but also specifies the exact difference b/w intervals; it lacks a true zero point Characteristics: - data is ordered with consistent intervals b/w values - allows for meaningful comparisons of differences - no true zero point e.g - temperature in Fahrenheit - zero Fahrenheit doesn't mean there is no temperature - years - there is no zero year
ratio scale - the order matters - differences are measurable [ratios can be measured] - contains a true zero starting point e.g - marks for a student start from zero
Measure of central tendency
mean = normal distribution median = skewed distributions
real world applications Feature Engineering: [age, sex, salary, city]
where there are missing values, choose based on the distribution: when the distribution is right skewed, the median can be used (see the sketch below).
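A tiny illustrative sketch (invented numbers, assuming numpy is installed) of why the median is the safer choice when an outlier skews the data:
import numpy as np
salary=[30,32,35,38,40,45,300] # right skewed: one large outlier
print(np.mean(salary)) # ~74.3, pulled up by the outlier
print(np.median(salary)) # 38.0, robust to the outlier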
Common measures of dispersion 1. range - difference b/w the maximum and minimum value of the dataset - Range = maximum value - minimum value Characteristics: - simple to calculate - sensitive to outliers - rough measure of dispersion
2. variance
- measures the average squared deviation of each value from the mean.
- provides a sense of how much the values vary in the dataset
- Population variance = sigma squared
- Sample variance = s squared
Characteristics:
- provides a precise measure of variability
- units are the square of the original data units
- more sensitive to outliers than the range
3. standard deviation
- SD is the square root of the variance
Characteristics:
- provides a clear measure of spread in the same units as the data
- sensitive to outliers
4. interquartile range IQR
Random Variables
Percentile
5 Number summary and box plot
Removing the Outliers x=[1,2,2,2,3,3,4,5,5,5,6,6,6,6,7,8,9,9,29]
What to compute are the lower fence and higher fence
Compute lower fence:
lower fence = Q1 - 1.5[interquartile range IQR]
Higher fence = Q3 + 1.5[IQR]
Q1 = 25%
Q1 = 25/100 * (19+1) = 5 = 5th value
Q1 = 3
Q3 = 75% = 75/100 * (19+1) = 15 = 15th value
Q3 = 7
IQR = Q3 - Q1 = 7 - 3 = 4
Lower fence = Q1 - 1.5(IQR) = 3 - 1.5(4) = -3
Higher fence = Q3 + 1.5(IQR) = 7 + 1.5(4) = 13
- This gives the acceptable range [-3, 13]; anything outside these fences is an outlier (see the numpy sketch after the 5 number summary below)
5 Number summary and box plot:
Minimum = 1
Q1 = 3
Median = 5
Q3 = 7
maximum = 9 [excluding outliers]
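The same fences can be computed programmatically; a minimal numpy sketch on the list above (note np.quantile's default linear interpolation gives slightly different Q1/Q3 than the hand method, so the fences differ a little, but 29 is flagged either way):
import numpy as np
x=[1,2,2,2,3,3,4,5,5,5,6,6,6,6,7,8,9,9,29]
Q1,Q3=np.quantile(x,[0.25,0.75])
IQR=Q3-Q1
lower_fence=Q1-1.5*IQR
higher_fence=Q3+1.5*IQR
cleaned=[v for v in x if lower_fence <= v <= higher_fence] # 29 is dropped as an outlier
print(lower_fence,higher_fence)
print(cleaned)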
Histogram and Skewness:
e.g Age = {11,12,13,14,15,18,24,30,35,36,40,42,43,50}
values range from 0 to 50
1. No. of bins to be decided
let's take 10 here: max value 50 / 10 = 5 bin width
bins {0-5, 5-10, 10-15, ..., 45-50}
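A short matplotlib sketch of exactly this binning (10 bins over 0-50, so bin width 5):
import matplotlib.pyplot as plt
age=[11,12,13,14,15,18,24,30,35,36,40,42,43,50]
plt.hist(age,bins=10,range=(0,50),edgecolor='black') # bin width = 50/10 = 5
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.show()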
Covariance and Correlation
Covariance:
- variables tending to increase and decrease together is positive covariance.
- one variable tending to increase when the other decreases is negative covariance.
Advantages:
- quantifies the relationship b/w x & y
Disadvantages:
- covariance doesn't have specific limits; values vary from -infinity to +infinity
Correlation:
- Pearson correlation = limited to [-1, +1]
- the closer the value is to +1, the more +vely correlated X & Y are.
- the closer the value is to -1, the more -vely correlated x & y are.
# when the data does not lie on a straight line, i.e. non-linear data points, Pearson correlation will not be equal to 1
- Spearman rank correlation:
# when the data points follow a non-linear but monotonic path, Spearman correlation can still reach 1
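An illustrative pandas sketch (toy data) contrasting the three measures; y = x cubed is monotonic but non-linear, so Spearman reaches 1 while Pearson stays below it:
import pandas as pd
df_corr=pd.DataFrame({'x':[1,2,3,4,5]})
df_corr['y']=df_corr['x']**3 # monotonic but non-linear
print(df_corr['x'].cov(df_corr['y'])) # covariance: unbounded units
print(df_corr['x'].corr(df_corr['y'])) # Pearson ~0.94, < 1
print(df_corr['x'].corr(df_corr['y'],method='spearman')) # Spearman = 1.0 (rank based)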
Probability Distribution functions: 1. probability mass function [pmf] 2. probability density function [pdf] 3. cumulative distribution function [cdf] PDF properties - non-negativity - the total area under the PDF curve is equal to 1
Types of probability distribution: [pmf, pdf, cdf]
Data set = follows a distribution
1. Bernoulli distribution = outcome is always binary [pmf] = discrete random variable.
2. Binomial distribution = [pmf]
3. Normal/Gaussian distribution = [pdf] = continuous random variable
4. Poisson distribution = [pmf]
5. Log-normal distribution [pdf]
6. Uniform distribution [pdf or pmf]
Bernoulli distribution: discrete random variable
Binomial distribution: with parameters n and p, the discrete probability distribution of the number of successes in a sequence of n independent experiments.
- discrete random variable
- binary outcome [yes or no]
- the experiment is performed for n trials
Poisson distribution: - discrete random variable - describes the number of events occurring in a fixed time interval e.g. - no. of people visiting a hospital every hour.
Normal or Gaussian distribution: - continuous random variable
- e.g. weights of students in a class will follow a normal distribution
heights of students in a class will follow a normal distribution
- standard normal: mean = 0 and SD = 1
- Dataset example
Age[yrs], Weight[kg], Height[Cms], Salary[INR]
Here all of these need to be brought to the same scale, which helps ML models give better results
- this process is called standardization, using the z-score (sketch below)
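A minimal standardization sketch with sklearn's StandardScaler (the toy columns here are made up); z = (x - mean) / SD per column:
import pandas as pd
from sklearn.preprocessing import StandardScaler
df_std=pd.DataFrame({'Age':[25,32,47,51],'Salary':[30000,45000,80000,95000]})
scaled=StandardScaler().fit_transform(df_std) # each column now has mean 0, SD 1
print(scaled)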
Uniform distribution - continuous uniform distribution [pdf] - discrete uniform distribution [pmf]
Continuous uniform distribution [pdf]:
- e.g. the no. of candies sold daily in a shop is uniformly distributed with a maximum of 40 candies and a minimum of 10
- 1. probability of daily sales falling b/w 15 and 30
Discrete uniform distribution [pmf]
- discrete random variable
- e.g. rolling a die
- the outcomes are x=[1,2,3,4,5,6], and the probability of each outcome is
Pr(1)=1/6
Pr(2)=1/6
Pr(3)=1/6
Pr(4)=1/6
Pr(5)=1/6
Pr(6)=1/6
Log-normal distribution: - a right-skewed distribution - e.g. - wealth distribution of the world - discussion forum: a small % will write very few words, another few % write longer posts, and the majority write an average number of words
Power law distribution: - functional relationship b/w two quantities - the 80:20 rule - apply a Box-Cox transformation to convert it into a normal distribution - e.g. - in the IPL, 20% of the team is responsible for winning 80% of the matches - 80% of wealth is held by 20% of the total population - 80% of crude oil is within 20% of the nations
Pareto distribution: [non-Gaussian distribution] - a Box-Cox transformation is used to convert it into a normal distribution - always follows a power law distribution - e.g. - 80% of the entire IT project is done by 20% of the team - 80% of the defects can be resolved by fixing 20% of the main defects
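A quick numpy sketch (illustrative parameters) drawing samples from the distributions listed above:
import numpy as np
rng=np.random.default_rng(42)
bernoulli=rng.binomial(n=1,p=0.3,size=10) # Bernoulli = binomial with n=1
binomial=rng.binomial(n=10,p=0.5,size=10) # successes in 10 trials
poisson=rng.poisson(lam=4,size=10) # events per fixed time interval
normal=rng.normal(loc=0,scale=1,size=10) # mean 0, SD 1
lognormal=rng.lognormal(mean=0,sigma=1,size=10) # right skewed
uniform=rng.uniform(low=10,high=40,size=10) # e.g. candies sold daily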
Central limit theorem: - relies on the concept of a sampling distribution - which is the distribution of a statistic over a large number of samples taken from a population - the sampling distribution of the mean will always be approximately normally distributed.
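A simulation sketch of the theorem: even though the population below is uniform (flat), the histogram of sample means comes out roughly bell shaped:
import numpy as np
import matplotlib.pyplot as plt
rng=np.random.default_rng(0)
population=rng.uniform(0,100,size=100000) # clearly non-normal population
sample_means=[rng.choice(population,size=30).mean() for _ in range(1000)]
plt.hist(sample_means,bins=30,edgecolor='black')
plt.title('Sampling distribution of the mean (n=30)')
plt.show()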
Hypothesis testing: - inferential statistics = come up with a conclusion or inference - with the help of sample data we come to a conclusion about the population.
1. Null Hypothesis (Ho)
2. Alternative Hypothesis (H1)
- opposite of the null hypothesis
3. Experiments - statistical analysis
4. Accept the null hypothesis or reject the null hypothesis
- P value
- P < 0.05 reject null hypothesis
- Z test:
- population std deviation known
- n >= 30
- 2 tail test
- t distribution
- t test
- Type 1 and Type 2 error
- reality: null hypothesis is true or null hypothesis is false
- decision: retain the null hypothesis or reject the null hypothesis
outcome 1:
- we reject the null hypothesis when
in reality it is false -> correct
outcome 2:
- we reject the null hypothesis when
in reality it is true -> type 1 error
outcome 3:
- we retain the null hypothesis when
in reality it is false -> type 2 error
outcome 4:
- we retain the null hypothesis when
in reality it is true -> correct
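A one-sample test sketch with scipy (sample numbers invented); a t-test is used here since the population SD is unknown, with H0: population mean = 50:
from scipy import stats
sample=[48,52,49,55,51,50,47,53,52,49]
t_stat,p_value=stats.ttest_1samp(sample,popmean=50)
print(t_stat,p_value)
if p_value < 0.05:
    print('reject the null hypothesis')
else:
    print('fail to reject the null hypothesis')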
Bayes theorem [Bayesian statistics]
- Probability
- independent events
- dependent events
Independent events:
E.g.
Rolling a die [outcomes not dependent on each other]
Tossing a coin
Dependent events:
E.g.
3 red marbles, 2 yellow marbles: picking a red, then in the next event picking a yellow - the probability of occurrence gets affected.
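Working that marble example out (draws without replacement, so the events are dependent):
p_red=3/5 # first draw: 3 red out of 5 marbles
p_yellow_given_red=2/4 # second draw: 2 yellow out of the 4 left
print(p_red*p_yellow_given_red) # 0.3 = P(red then yellow)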
Confidence Interval, Chi-square test, ANOVA - analysis of variance
Feature Engineering of Missing Data: Missing Values: there are 3 mechanisms
1. Missing Completely at Random [MCAR]
- missing values are randomly distributed
- e.g. - survey participants with missing values on certain questions were selected at random
2. Missing at Random [MAR]
- the probability of a value being missing depends only on the observed data
- missing values are systematically related to the observed data
- e.g. - suppose we collect income from a group of people and some choose not to report it
- men not disclosing salary and women not updating age
3. Missing Not at Random [MNAR]
- the probability of a value being missing depends on the value of the missing data itself
- e.g. - collecting income and job satisfaction: employees who are less satisfied with their jobs are more likely to refuse to report their income
'''
import seaborn as sns
df_titan=sns.load_dataset('titanic')
df_titan.head(5)
survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True |
3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | C | Southampton | yes | False |
4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | NaN | Southampton | no | True |
### missing values
#df_titan.isnull()
df_titan.isnull().sum()
survived 0 pclass 0 sex 0 age 177 sibsp 0 parch 0 fare 0 embarked 2 class 0 who 0 adult_male 0 deck 688 embark_town 2 alive 0 alone 0 dtype: int64
## delete the rows or data points
df_titan.shape
(891, 15)
# dropping the NAs reduces the dataset row count
df_titan.dropna().shape
(182, 15)
## column wise deletion
df_titan.dropna(axis=1)
survived | pclass | sex | sibsp | parch | fare | class | who | adult_male | alive | alone | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 3 | male | 1 | 0 | 7.2500 | Third | man | True | no | False |
1 | 1 | 1 | female | 1 | 0 | 71.2833 | First | woman | False | yes | False |
2 | 1 | 3 | female | 0 | 0 | 7.9250 | Third | woman | False | yes | True |
3 | 1 | 1 | female | 1 | 0 | 53.1000 | First | woman | False | yes | False |
4 | 0 | 3 | male | 0 | 0 | 8.0500 | Third | man | True | no | True |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
886 | 0 | 2 | male | 0 | 0 | 13.0000 | Second | man | True | no | True |
887 | 1 | 1 | female | 0 | 0 | 30.0000 | First | woman | False | yes | True |
888 | 0 | 3 | female | 1 | 2 | 23.4500 | Third | woman | False | no | False |
889 | 1 | 1 | male | 0 | 0 | 30.0000 | First | man | True | yes | True |
890 | 0 | 3 | male | 0 | 0 | 7.7500 | Third | man | True | no | True |
891 rows × 11 columns
Imputation techniques for handling missing values
#1. Mean Value imputation
sns.displot(df_titan['age'],kde=True)
<seaborn.axisgrid.FacetGrid at 0x1384c252210>
df_titan=sns.load_dataset('titanic')
df_titan['Age_mean']=df_titan['age'].fillna(df_titan['age'].mean())
df_titan[['Age_mean','age']]
### NaN values are replaced in the new column
Age_mean | age | |
---|---|---|
0 | 22.000000 | 22.0 |
1 | 38.000000 | 38.0 |
2 | 26.000000 | 26.0 |
3 | 35.000000 | 35.0 |
4 | 35.000000 | 35.0 |
... | ... | ... |
886 | 27.000000 | 27.0 |
887 | 19.000000 | 19.0 |
888 | 29.699118 | NaN |
889 | 26.000000 | 26.0 |
890 | 32.000000 | 32.0 |
891 rows × 2 columns
### mean imputation works well with normally distributed data
## 2. Median value imputation
## where the distribution is non-normal or skewed
df_titan['Age_median']=df_titan['age'].fillna(df_titan['age'].median())
df_titan[['Age_median','age']]
Age_median | age | |
---|---|---|
0 | 22.0 | 22.0 |
1 | 38.0 | 38.0 |
2 | 26.0 | 26.0 |
3 | 35.0 | 35.0 |
4 | 35.0 | 35.0 |
... | ... | ... |
886 | 27.0 | 27.0 |
887 | 19.0 | 19.0 |
888 | 28.0 | NaN |
889 | 26.0 | 26.0 |
890 | 32.0 | 32.0 |
891 rows × 2 columns
### 3. Mode imputation technique -- for categorical values
df_titan[df_titan['embarked'].isnull()]
survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | Age_mean | Age_median | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
61 | 1 | 1 | female | 38.0 | 0 | 0 | 80.0 | NaN | First | woman | False | B | NaN | yes | True | 38.0 | 38.0 |
829 | 1 | 1 | female | 62.0 | 0 | 0 | 80.0 | NaN | First | woman | False | B | NaN | yes | True | 62.0 | 62.0 |
df_titan['embarked'].unique()
array(['S', 'C', 'Q', nan], dtype=object)
df_titan['embarked'].notna()
mode_value=df_titan[df_titan['embarked'].notna()]['embarked'].mode()[0]
df_titan['embarked_mode']=df_titan['embarked'].fillna(mode_value)
df_titan[['embarked_mode','embarked']]
embarked_mode | embarked | |
---|---|---|
0 | S | S |
1 | C | C |
2 | S | S |
3 | S | S |
4 | S | S |
... | ... | ... |
886 | S | S |
887 | S | S |
888 | S | S |
889 | C | C |
890 | Q | Q |
891 rows × 2 columns
df_titan['embarked_mode'].isnull().sum()
np.int64(0)
df_titan['embarked'].isnull().sum()
np.int64(2)
e.g. trying to solve a data set where - the output is categorical - this is binary classification - out of 1000 data points, 900 yes & 100 no - this is an example of an imbalanced data set - this is a problem: the model will become biased while predicting
How to handle this:
- up sampling
- down sampling
###
import numpy as np
import pandas as pd
## set the random seed; we will create two classes
np.random.seed(123)
# create a dataframe
n_samples=1000
class_0_ratios=0.9
n_class_0=int(n_samples * class_0_ratios)
n_class_1=n_samples - n_class_0
n_class_0,n_class_1
(900, 100)
## create a data set with imbalanced classes
class_0 = pd.DataFrame({
'feature_1':np.random.normal(loc=0, scale=1, size=n_class_0),
'feature_2':np.random.normal(loc=0, scale=1, size=n_class_0),
'target':[0] * n_class_0
})
class_1 = pd.DataFrame({
'feature_1':np.random.normal(loc=2, scale=1, size=n_class_1),
'feature_2':np.random.normal(loc=2, scale=1, size=n_class_1),
'target':[1] * n_class_1
})
df_class10=pd.concat([class_0,class_1]).reset_index(drop=True)
df_class10.tail()
feature_1 | feature_2 | target | |
---|---|---|---|
995 | 1.376371 | 2.845701 | 1 |
996 | 2.239810 | 0.880077 | 1 |
997 | 1.131760 | 1.640703 | 1 |
998 | 2.902006 | 0.390305 | 1 |
999 | 2.697490 | 2.013570 | 1 |
df_class10['target'].value_counts()
target 0 900 1 100 Name: count, dtype: int64
##### up sampling
df_class10_minority=df_class10[df_class10['target']==1]
df_class10_majority=df_class10[df_class10['target']==0]
from sklearn.utils import resample
df_class10_minority_upsampled = resample(df_class10_minority,replace=True,
n_samples=len(df_class10_majority),
random_state=42
)
df_class10_minority_upsampled.shape
(900, 3)
df_class10_minority_upsampled.head()
feature_1 | feature_2 | target | |
---|---|---|---|
951 | 1.125854 | 1.843917 | 1 |
992 | 2.196570 | 1.397425 | 1 |
914 | 1.932170 | 2.998053 | 1 |
971 | 2.272825 | 3.034197 | 1 |
960 | 2.870056 | 1.550485 | 1 |
df_upsampled=pd.concat([df_class10_majority,df_class10_minority_upsampled])
df_upsampled['target'].value_counts()
target 0 900 1 900 Name: count, dtype: int64
### Down sampling
df_class10_majority_downsampled = resample(df_class10_majority,replace=False,
n_samples=len(df_class10_minority),
random_state=42
)
df_class10_majority_downsampled.shape
(100, 3)
df_downsampled=pd.concat([df_class10_minority,df_class10_majority_downsampled])
df_downsampled['target'].value_counts()
target 1 100 0 100 Name: count, dtype: int64
from sklearn.datasets import make_classification
x,y=make_classification(n_samples=1000,
n_redundant=0,
n_features=2,
n_clusters_per_class=1,
weights=[.90],
random_state=12)
import pandas as pd
df1_smote=pd.DataFrame(x,columns=['f1','f2'])
df2_smote=pd.DataFrame(y,columns=['target'])
final_df_smote=pd.concat([df1_smote,df2_smote],axis=1)
final_df_smote.head(5)
f1 | f2 | target | |
---|---|---|---|
0 | -0.762898 | -0.706808 | 0 |
1 | -1.075436 | -1.051162 | 0 |
2 | -0.610115 | -0.909802 | 0 |
3 | -2.023284 | -0.428945 | 1 |
4 | -0.812921 | -1.316206 | 0 |
final_df_smote['target'].value_counts()
target 0 900 1 100 Name: count, dtype: int64
import matplotlib.pyplot as plt
plt.scatter(final_df_smote['f1'],final_df_smote['f2'],c=final_df_smote['target'])
<matplotlib.collections.PathCollection at 0x1384c119950>
from imblearn.over_sampling import SMOTE
## transform the data set
oversample1=SMOTE()
x,y=oversample1.fit_resample(final_df_smote[['f1','f2']],final_df_smote['target'])
x.shape
(1800, 2)
y.shape
(1800,)
y[y==0]
0 0 1 0 2 0 4 0 5 0 .. 995 0 996 0 997 0 998 0 999 0 Name: target, Length: 900, dtype: int64
len(y[y==0])
900
len(y[y==1])
900
df1_smote=pd.DataFrame(x,columns=['f1','f2'])
df2_smote=pd.DataFrame(y,columns=['target'])
oversample1_df=pd.concat([df1_smote,df2_smote],axis=1)
plt.scatter(oversample1_df['f1'],oversample1_df['f2'],c=oversample1_df['target'])
<matplotlib.collections.PathCollection at 0x1384c1a9090>
## minimum, maximum, median, Q1, Q3, IQR
import numpy as np
lst_marks1=[45,35,46,56,74,89,54,32,89,90,87,67,54,45,98,89,67,74]
minimum1,Q1,median1,Q3,maxmimum1=np.quantile(lst_marks1,[0,0.25,0.50,0.75,1.0])
minimum1,Q1,median1,Q3,maxmimum1
(np.float64(32.0), np.float64(48.0), np.float64(67.0), np.float64(88.5), np.float64(98.0))
IQR_1=Q3-Q1
print(IQR_1)
40.5
lower_fence1=Q1-1.5*(IQR_1)
higher_fence1=Q3+1.5*(IQR_1)
print(lower_fence1)
print(higher_fence1)
-12.75 149.25
lst_marks1=[45,35,46,56,74,89,54,32,89,90,87,67,54,45,98,89,67,74]
import seaborn as sns
sns.boxplot(lst_marks1)
<Axes: >
E.g: - Experience, Degree, Salary. Degree = BE, PhD, Master = these are categorical variables, which the model won't be able to understand. Converting these categorical variables into meaningful numerical values is the process called DATA ENCODING
Nominal / OHE Encoding: one hot encoding
- e.g:
Color = [Red, Green, Blue]
One hot encoding:
Red = [1,0,0]
Green = [0,1,0]
Blue = [0,0,1]
A sparse matrix results when there are many categories
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
df_one_hot=pd.DataFrame({
'color':['red','blue','green','green','red','blue']
})
df_one_hot.head()
color | |
---|---|
0 | red |
1 | blue |
2 | green |
3 | green |
4 | red |
### using ONE HOT ENCODING
encoder_one_hot=OneHotEncoder()
### perform fit and transform
encoded01_one_hot= encoder_one_hot.fit_transform(df_one_hot[['color']]).toarray()
'''
<Compressed Sparse Row sparse matrix of dtype 'float64'
with 6 stored elements and shape (6, 3)>
'''
encoded01_one_hot
array([[0., 0., 1.], [1., 0., 0.], [0., 1., 0.], [0., 1., 0.], [0., 0., 1.], [1., 0., 0.]])
encoder_one_hot_df=pd.DataFrame(encoded01_one_hot,columns=encoder_one_hot.get_feature_names_out())
encoder_one_hot_df
color_blue | color_green | color_red | |
---|---|---|---|
0 | 0.0 | 0.0 | 1.0 |
1 | 1.0 | 0.0 | 0.0 |
2 | 0.0 | 1.0 | 0.0 |
3 | 0.0 | 1.0 | 0.0 |
4 | 0.0 | 0.0 | 1.0 |
5 | 1.0 | 0.0 | 0.0 |
## transforming new data when it is added
encoder_one_hot.transform([['blue']]).toarray()
array([[1., 0., 0.]])
pd.concat([df_one_hot,encoder_one_hot_df],axis=1)
color | color_blue | color_green | color_red | |
---|---|---|---|---|
0 | red | 0.0 | 0.0 | 1.0 |
1 | blue | 1.0 | 0.0 | 0.0 |
2 | green | 0.0 | 1.0 | 0.0 |
3 | green | 0.0 | 1.0 | 0.0 |
4 | red | 0.0 | 0.0 | 1.0 |
5 | blue | 1.0 | 0.0 | 0.0 |
from sklearn.preprocessing import LabelEncoder
lbl_encoder1=LabelEncoder()
df_one_hot.head() ## note: head() needs parentheses, otherwise the bound method is returned
color | |
---|---|
0 | red |
1 | blue |
2 | green |
3 | green |
4 | red |
lbl_encoder1.fit_transform(df_one_hot[['color']])
array([2, 0, 1, 1, 2, 0])
lbl_encoder1.transform([['red']])
array([2])
lbl_encoder1.transform([['green']])
array([1])
lbl_encoder1.transform([['blue']])
array([0])
where the data has some ranking or intrinsic order
e.g: high school = 1 college = 2 Graduate = 3 Post-graduate = 4
### ordinal encoding
from sklearn.preprocessing import OrdinalEncoder
df_ordinal=pd.DataFrame({
'size':['small','medium','large','medium','small','large']
})
df_ordinal
size | |
---|---|
0 | small |
1 | medium |
2 | large |
3 | medium |
4 | small |
5 | large |
### create an instance of ordinal encoder, then perform fit_transform
encoder_ordin=OrdinalEncoder(categories=[['small','medium','large']])
encoder_ordin.fit_transform(df_ordinal[['size']])
array([[0.], [1.], [2.], [1.], [0.], [2.]])
encoder_ordin.transform([['small']])
array([[0.]])
### target guided encoding: create categorical variables with a target column
df_data_target1= pd.DataFrame({
'city':['new york','london','paris','tokyo', 'new york','paris'],
'price':[200,150,300,250,180,320]
})
df_data_target1
city | price | |
---|---|---|
0 | new york | 200 |
1 | london | 150 |
2 | paris | 300 |
3 | tokyo | 250 |
4 | new york | 180 |
5 | paris | 320 |
mean_price_target1=df_data_target1.groupby('city')['price'].mean().to_dict()
mean_price_target1
{'london': 150.0, 'new york': 190.0, 'paris': 310.0, 'tokyo': 250.0}
df_data_target1['city_encoded']=df_data_target1['city'].map(mean_price_target1)
df_data_target1
city | price | city_encoded | |
---|---|---|---|
0 | new york | 200 | 190.0 |
1 | london | 150 | 150.0 |
2 | paris | 300 | 310.0 |
3 | tokyo | 250 | 250.0 |
4 | new york | 180 | 190.0 |
5 | paris | 320 | 310.0 |
df_data_target1[['price','city_encoded']]
price | city_encoded | |
---|---|---|
0 | 200 | 190.0 |
1 | 150 | 150.0 |
2 | 300 | 310.0 |
3 | 250 | 250.0 |
4 | 180 | 190.0 |
5 | 320 | 310.0 |
''' https://archive.ics.uci.edu/dataset/186/wine+quality '''
import pandas as pd
df_redwine=pd.read_csv('winequality-red.csv')
df_redwine.head()
fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality" | |
---|---|
0 | 7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5 |
1 | 7.8;0.88;0;2.6;0.098;25;67;0.9968;3.2;0.68;9.8;5 |
2 | 7.8;0.76;0.04;2.3;0.092;15;54;0.997;3.26;0.65;... |
3 | 11.2;0.28;0.56;1.9;0.075;17;60;0.998;3.16;0.58... |
4 | 7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5 |
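Everything landed in one column because this file is semicolon delimited; passing sep=';' fixes the parse (a sketch, assuming the same local file):
df_redwine=pd.read_csv('winequality-red.csv',sep=';')
df_redwine.head()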
from ucimlrepo import fetch_ucirepo
wine_quality= fetch_ucirepo(id=186)
wine_quality
{'data': {'features': [6497 rows x 11 columns], 'targets': 'quality' [6497 rows x 1 columns], 'original': [6497 rows x 13 columns, adding 'quality' and 'color'], 'headers': Index(['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar', 'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality', 'color'])}, 'metadata': {'uci_id': 186, 'name': 'Wine Quality', 'repository_url': 'https://archive.ics.uci.edu/dataset/186/wine+quality', 'target_col': ['quality'], 'has_missing_values': 'no', 'year_of_dataset_creation': 2009, ...}, 'variables': 11 continuous features, integer target 'quality' (score between 0 and 10), categorical 'color' (red or white)} (full repr truncated)
df_wine_quality = wine_quality.data.features
df_wine_quality.head()
fixed_acidity | volatile_acidity | citric_acid | residual_sugar | chlorides | free_sulfur_dioxide | total_sulfur_dioxide | density | pH | sulphates | alcohol | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 |
1 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.9968 | 3.20 | 0.68 | 9.8 |
2 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.9970 | 3.26 | 0.65 | 9.8 |
3 | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.9980 | 3.16 | 0.58 | 9.8 |
4 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 |
wine_quality.data.targets
quality | |
---|---|
0 | 5 |
1 | 5 |
2 | 5 |
3 | 6 |
4 | 5 |
... | ... |
6492 | 6 |
6493 | 5 |
6494 | 6 |
6495 | 7 |
6496 | 6 |
6497 rows × 1 columns
df_wine_quality.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 6497 entries, 0 to 6496 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fixed_acidity 6497 non-null float64 1 volatile_acidity 6497 non-null float64 2 citric_acid 6497 non-null float64 3 residual_sugar 6497 non-null float64 4 chlorides 6497 non-null float64 5 free_sulfur_dioxide 6497 non-null float64 6 total_sulfur_dioxide 6497 non-null float64 7 density 6497 non-null float64 8 pH 6497 non-null float64 9 sulphates 6497 non-null float64 10 alcohol 6497 non-null float64 dtypes: float64(11) memory usage: 558.5 KB
## descriptive summary
df_1_wine_quality=pd.DataFrame(df_wine_quality)
df_1_wine_quality.describe()
#### adding the quality column into the same data frame
df_1_wine_quality['quality']=wine_quality.data.targets
df_1_wine_quality.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 6497 entries, 0 to 6496 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fixed_acidity 6497 non-null float64 1 volatile_acidity 6497 non-null float64 2 citric_acid 6497 non-null float64 3 residual_sugar 6497 non-null float64 4 chlorides 6497 non-null float64 5 free_sulfur_dioxide 6497 non-null float64 6 total_sulfur_dioxide 6497 non-null float64 7 density 6497 non-null float64 8 pH 6497 non-null float64 9 sulphates 6497 non-null float64 10 alcohol 6497 non-null float64 11 quality 6497 non-null int64 dtypes: float64(11), int64(1) memory usage: 609.2 KB
#### number of rows and columns
df_1_wine_quality.shape
(6497, 12)
## column names
df_1_wine_quality.columns
Index(['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar', 'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality'], dtype='object')
df_1_wine_quality['quality'].unique()
array([5, 6, 7, 4, 8, 3, 9])
## missing values in the data set
df_1_wine_quality.isnull().sum()
fixed_acidity 0 volatile_acidity 0 citric_acid 0 residual_sugar 0 chlorides 0 free_sulfur_dioxide 0 total_sulfur_dioxide 0 density 0 pH 0 sulphates 0 alcohol 0 quality 0 dtype: int64
### duplicate records
df_1_wine_quality.duplicated() ## duplicated rows are flagged as True
df_1_wine_quality[df_1_wine_quality.duplicated()]
fixed_acidity | volatile_acidity | citric_acid | residual_sugar | chlorides | free_sulfur_dioxide | total_sulfur_dioxide | density | pH | sulphates | alcohol | quality | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
4 | 7.4 | 0.700 | 0.00 | 1.90 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.400000 | 5 |
11 | 7.5 | 0.500 | 0.36 | 6.10 | 0.071 | 17.0 | 102.0 | 0.99780 | 3.35 | 0.80 | 10.500000 | 5 |
27 | 7.9 | 0.430 | 0.21 | 1.60 | 0.106 | 10.0 | 37.0 | 0.99660 | 3.17 | 0.91 | 9.500000 | 5 |
40 | 7.3 | 0.450 | 0.36 | 5.90 | 0.074 | 12.0 | 87.0 | 0.99780 | 3.33 | 0.83 | 10.500000 | 5 |
65 | 7.2 | 0.725 | 0.05 | 4.65 | 0.086 | 4.0 | 11.0 | 0.99620 | 3.41 | 0.39 | 10.900000 | 5 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
6427 | 6.4 | 0.230 | 0.35 | 10.30 | 0.042 | 54.0 | 140.0 | 0.99670 | 3.23 | 0.47 | 9.200000 | 5 |
6449 | 7.0 | 0.360 | 0.35 | 2.50 | 0.048 | 67.0 | 161.0 | 0.99146 | 3.05 | 0.56 | 11.100000 | 6 |
6450 | 6.4 | 0.330 | 0.44 | 8.90 | 0.055 | 52.0 | 164.0 | 0.99488 | 3.10 | 0.48 | 9.600000 | 5 |
6455 | 7.1 | 0.230 | 0.39 | 13.70 | 0.058 | 26.0 | 172.0 | 0.99755 | 2.90 | 0.46 | 9.000000 | 6 |
6479 | 6.6 | 0.340 | 0.40 | 8.10 | 0.046 | 68.0 | 170.0 | 0.99494 | 3.15 | 0.50 | 9.533333 | 6 |
1179 rows × 12 columns
### removing the duplicates
df_1_wine_quality.drop_duplicates(inplace=True)
df_1_wine_quality.shape
(5318, 12)
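Note that drop_duplicates keeps the original row labels, so the index now has gaps (index 4 disappears in the head() output further down). If a contiguous index is preferred, an optional one-liner, kept commented here since the outputs below use the gapped index:
## optional: rebuild a contiguous 0..n-1 index after dropping duplicates
# df_1_wine_quality = df_1_wine_quality.reset_index(drop=True)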
df_1_wine_quality.corr()
fixed_acidity | volatile_acidity | citric_acid | residual_sugar | chlorides | free_sulfur_dioxide | total_sulfur_dioxide | density | pH | sulphates | alcohol | quality | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
fixed_acidity | 1.000000 | 0.214595 | 0.330225 | -0.104641 | 0.288843 | -0.281375 | -0.327462 | 0.478148 | -0.271094 | 0.304728 | -0.102696 | -0.080190 |
volatile_acidity | 0.214595 | 1.000000 | -0.384395 | -0.163896 | 0.367573 | -0.348865 | -0.400715 | 0.308375 | 0.246837 | 0.227645 | -0.065510 | -0.265354 |
citric_acid | 0.330225 | -0.384395 | 1.000000 | 0.146078 | 0.055107 | 0.131113 | 0.194888 | 0.094663 | -0.344675 | 0.059061 | -0.005572 | 0.097926 |
residual_sugar | -0.104641 | -0.163896 | 0.146078 | 1.000000 | -0.123302 | 0.399090 | 0.487593 | 0.520867 | -0.234443 | -0.174946 | -0.305334 | -0.056824 |
chlorides | 0.288843 | 0.367573 | 0.055107 | -0.123302 | 1.000000 | -0.186518 | -0.269896 | 0.371665 | 0.025914 | 0.404905 | -0.269516 | -0.201960 |
free_sulfur_dioxide | -0.281375 | -0.348865 | 0.131113 | 0.399090 | -0.186518 | 1.000000 | 0.720576 | 0.006360 | -0.141968 | -0.198075 | -0.169887 | 0.054199 |
total_sulfur_dioxide | -0.327462 | -0.400715 | 0.194888 | 0.487593 | -0.269896 | 0.720576 | 1.000000 | 0.006677 | -0.223000 | -0.275877 | -0.248942 | -0.050183 |
density | 0.478148 | 0.308375 | 0.094663 | 0.520867 | 0.371665 | 0.006360 | 0.006677 | 1.000000 | 0.034377 | 0.282494 | -0.667861 | -0.326301 |
pH | -0.271094 | 0.246837 | -0.344675 | -0.234443 | 0.025914 | -0.141968 | -0.223000 | 0.034377 | 1.000000 | 0.168287 | 0.097392 | 0.039789 |
sulphates | 0.304728 | 0.227645 | 0.059061 | -0.174946 | 0.404905 | -0.198075 | -0.275877 | 0.282494 | 0.168287 | 1.000000 | -0.017166 | 0.042040 |
alcohol | -0.102696 | -0.065510 | -0.005572 | -0.305334 | -0.269516 | -0.169887 | -0.248942 | -0.667861 | 0.097392 | -0.017166 | 1.000000 | 0.469216 |
quality | -0.080190 | -0.265354 | 0.097926 | -0.056824 | -0.201960 | 0.054199 | -0.050183 | -0.326301 | 0.039789 | 0.042040 | 0.469216 | 1.000000 |
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(10,6))
sns.heatmap(df_1_wine_quality.corr(),annot=True)
<Axes: >
### visualization
### conclusion: this is an imbalanced dataset
df_1_wine_quality.quality.value_counts()
quality 6 2323 5 1751 7 855 4 206 8 148 3 30 9 5 Name: count, dtype: int64
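To put numbers on the imbalance, value_counts(normalize=True) expresses each class as a share of the data; a small sketch, not part of the original cells:
## percentage of samples in each quality class
df_1_wine_quality['quality'].value_counts(normalize=True) * 100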
df_1_wine_quality.quality.value_counts().plot(kind='bar')
plt.xlabel("wine Quality")
plt.ylabel("count")
plt.title("Wine quality data counts")
plt.show()
df_1_wine_quality.head()
fixed_acidity | volatile_acidity | citric_acid | residual_sugar | chlorides | free_sulfur_dioxide | total_sulfur_dioxide | density | pH | sulphates | alcohol | quality | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
1 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.9968 | 3.20 | 0.68 | 9.8 | 5 |
2 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.9970 | 3.26 | 0.65 | 9.8 | 5 |
3 | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.9980 | 3.16 | 0.58 | 9.8 | 6 |
5 | 7.4 | 0.66 | 0.00 | 1.8 | 0.075 | 13.0 | 40.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
for column in df_1_wine_quality.columns:
    plt.figure()  # new figure per column so the histograms don't overplot each other
    sns.histplot(df_1_wine_quality[column], kde=True)
    plt.show()
sns.histplot(df_1_wine_quality['alcohol'],kde=True)
<Axes: xlabel='alcohol', ylabel='Count'>
#### univariate, bivariate and multivariate analysis
sns.pairplot(df_1_wine_quality)
<seaborn.axisgrid.PairGrid at 0x13849d168b0>
### categorical plot
sns.catplot(x='quality',y='alcohol',data=df_1_wine_quality,kind='box')
<seaborn.axisgrid.FacetGrid at 0x138557ae350>
sns.scatterplot(x='alcohol',y='pH',hue='quality',data=df_1_wine_quality)
<Axes: xlabel='alcohol', ylabel='pH'>
https://www.kaggle.com/datasets/shubhambathwal/flight-price-prediction
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#df_flight_clean=pd.read_csv('flight_Clean_Dataset.csv')
#df_flight_business=pd.read_csv('flight_business.csv')
#df_flight_economy=pd.read_csv('flight_economy.csv')
df_flight_price=pd.read_excel('flight_price.xlsx')
# df_flight_clean.head()
# df_flight_business.head()
# df_flight_economy.head()
df_flight_price.head()
Airline | Date_of_Journey | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Additional_Info | Price | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | IndiGo | 24/03/2019 | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 22 Mar | 2h 50m | non-stop | No info | 3897 |
1 | Air India | 1/05/2019 | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 13:15 | 7h 25m | 2 stops | No info | 7662 |
2 | Jet Airways | 9/06/2019 | Delhi | Cochin | DEL → LKO → BOM → COK | 09:25 | 04:25 10 Jun | 19h | 2 stops | No info | 13882 |
3 | IndiGo | 12/05/2019 | Kolkata | Banglore | CCU → NAG → BLR | 18:05 | 23:30 | 5h 25m | 1 stop | No info | 6218 |
4 | IndiGo | 01/03/2019 | Banglore | New Delhi | BLR → NAG → DEL | 16:50 | 21:35 | 4h 45m | 1 stop | No info | 13302 |
### get the basic info for the data set
df_flight_price.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10683 entries, 0 to 10682 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Airline 10683 non-null object 1 Date_of_Journey 10683 non-null object 2 Source 10683 non-null object 3 Destination 10683 non-null object 4 Route 10682 non-null object 5 Dep_Time 10683 non-null object 6 Arrival_Time 10683 non-null object 7 Duration 10683 non-null object 8 Total_Stops 10682 non-null object 9 Additional_Info 10683 non-null object 10 Price 10683 non-null int64 dtypes: int64(1), object(10) memory usage: 918.2+ KB
df_flight_price.describe()
Price | |
---|---|
count | 10683.000000 |
mean | 9087.064121 |
std | 4611.359167 |
min | 1759.000000 |
25% | 5277.000000 |
50% | 8372.000000 |
75% | 12373.000000 |
max | 79512.000000 |
### feature engineering process
# converting the date field
df_flight_price['Date_converted']=df_flight_price['Date_of_Journey'].str.split('/').str[0]
df_flight_price['Month_converted']=df_flight_price['Date_of_Journey'].str.split('/').str[1]
df_flight_price['Year_converted']=df_flight_price['Date_of_Journey'].str.split('/').str[2]
df_flight_price.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10683 entries, 0 to 10682 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Airline 10683 non-null object 1 Date_of_Journey 10683 non-null object 2 Source 10683 non-null object 3 Destination 10683 non-null object 4 Route 10682 non-null object 5 Dep_Time 10683 non-null object 6 Arrival_Time 10683 non-null object 7 Duration 10683 non-null object 8 Total_Stops 10682 non-null object 9 Additional_Info 10683 non-null object 10 Price 10683 non-null int64 11 Date_converted 10683 non-null object 12 Month_converted 10683 non-null object 13 Year_converted 10683 non-null object dtypes: int64(1), object(13) memory usage: 1.1+ MB
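An arguably more robust alternative to the three str.split calls is to parse Date_of_Journey as a real date and read the components off the .dt accessor; a sketch (kept commented), assuming the dd/mm/yyyy layout seen in head():
## alternative: parse the journey date once; dayfirst=True matches '24/03/2019'
# journey = pd.to_datetime(df_flight_price['Date_of_Journey'], dayfirst=True)
# df_flight_price['Date_converted'] = journey.dt.day
# df_flight_price['Month_converted'] = journey.dt.month
# df_flight_price['Year_converted'] = journey.dt.year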
### converting the date, month and year columns into int type
df_flight_price['Date_converted']=df_flight_price['Date_converted'].astype(int)
df_flight_price['Month_converted']=df_flight_price['Month_converted'].astype(int)
df_flight_price['Year_converted']=df_flight_price['Year_converted'].astype(int)
df_flight_price.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10683 entries, 0 to 10682 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Airline 10683 non-null object 1 Date_of_Journey 10683 non-null object 2 Source 10683 non-null object 3 Destination 10683 non-null object 4 Route 10682 non-null object 5 Dep_Time 10683 non-null object 6 Arrival_Time 10683 non-null object 7 Duration 10683 non-null object 8 Total_Stops 10682 non-null object 9 Additional_Info 10683 non-null object 10 Price 10683 non-null int64 11 Date_converted 10683 non-null int64 12 Month_converted 10683 non-null int64 13 Year_converted 10683 non-null int64 dtypes: int64(4), object(10) memory usage: 1.1+ MB
### dropping date of journey column
df_flight_price.drop('Date_of_Journey',axis=1,inplace=True)
df_flight_price.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10683 entries, 0 to 10682 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Airline 10683 non-null object 1 Source 10683 non-null object 2 Destination 10683 non-null object 3 Route 10682 non-null object 4 Dep_Time 10683 non-null object 5 Arrival_Time 10683 non-null object 6 Duration 10683 non-null object 7 Total_Stops 10682 non-null object 8 Additional_Info 10683 non-null object 9 Price 10683 non-null int64 10 Date_converted 10683 non-null int64 11 Month_converted 10683 non-null int64 12 Year_converted 10683 non-null int64 dtypes: int64(4), object(9) memory usage: 1.1+ MB
df_flight_price.head(2)
Airline | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Additional_Info | Price | Date_converted | Month_converted | Year_converted | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | IndiGo | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 22 Mar | 2h 50m | non-stop | No info | 3897 | 24 | 3 | 2019 |
1 | Air India | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 13:15 | 7h 25m | 2 stops | No info | 7662 | 1 | 5 | 2019 |
##### capturing only the value before the first space in Arrival_Time, to drop the trailing date
df_flight_price['Arrival_Time']=df_flight_price['Arrival_Time'].apply(lambda x:x.split(' ')[0])
df_flight_price['Arrival_Time']
0 01:10 1 13:15 2 04:25 3 23:30 4 21:35 ... 10678 22:25 10679 23:20 10680 11:20 10681 14:10 10682 19:15 Name: Arrival_Time, Length: 10683, dtype: object
df_flight_price['Arrival_Hour_converted']=df_flight_price['Arrival_Time'].str.split(':').str[0]
df_flight_price['Arrival_Minutes_converted']=df_flight_price['Arrival_Time'].str.split(':').str[1]
df_flight_price.head(2)
Airline | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Additional_Info | Price | Date_converted | Month_converted | Year_converted | Arrival_Hour_converted | Arrival_Minutes_converted | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | IndiGo | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 | 2h 50m | non-stop | No info | 3897 | 24 | 3 | 2019 | 01 | 10 |
1 | Air India | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 13:15 | 7h 25m | 2 stops | No info | 7662 | 1 | 5 | 2019 | 13 | 15 |
df_flight_price['Arrival_Hour_converted']=df_flight_price['Arrival_Hour_converted'].astype(int)
df_flight_price['Arrival_Minutes_converted']=df_flight_price['Arrival_Minutes_converted'].astype(int)
df_flight_price.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10683 entries, 0 to 10682 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Airline 10683 non-null object 1 Source 10683 non-null object 2 Destination 10683 non-null object 3 Route 10682 non-null object 4 Dep_Time 10683 non-null object 5 Arrival_Time 10683 non-null object 6 Duration 10683 non-null object 7 Total_Stops 10682 non-null object 8 Additional_Info 10683 non-null object 9 Price 10683 non-null int64 10 Date_converted 10683 non-null int64 11 Month_converted 10683 non-null int64 12 Year_converted 10683 non-null int64 13 Arrival_Hour_converted 10683 non-null int64 14 Arrival_Minutes_converted 10683 non-null int64 dtypes: int64(6), object(9) memory usage: 1.2+ MB
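The hour/minute split could equally be done by parsing the HH:MM strings as times; a sketch of that alternative (kept commented):
## alternative: parse 'HH:MM' and use the datetime accessor
# arrival = pd.to_datetime(df_flight_price['Arrival_Time'], format='%H:%M')
# df_flight_price['Arrival_Hour_converted'] = arrival.dt.hour
# df_flight_price['Arrival_Minutes_converted'] = arrival.dt.minute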
df_flight_price.drop('Arrival_Time',axis=1,inplace=True)
df_flight_price.head(2)
Airline | Source | Destination | Route | Dep_Time | Duration | Total_Stops | Additional_Info | Price | Date_converted | Month_converted | Year_converted | Arrival_Hour_converted | Arrival_Minutes_converted | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | IndiGo | Banglore | New Delhi | BLR → DEL | 22:20 | 2h 50m | non-stop | No info | 3897 | 24 | 3 | 2019 | 1 | 10 |
1 | Air India | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 7h 25m | 2 stops | No info | 7662 | 1 | 5 | 2019 | 13 | 15 |
df_flight_price['Dep_Time_Hour_converted']=df_flight_price['Dep_Time'].str.split(':').str[0]
df_flight_price['Dep_Time_Minutes_converted']=df_flight_price['Dep_Time'].str.split(':').str[1]
df_flight_price.head(2)
Airline | Source | Destination | Route | Dep_Time | Duration | Total_Stops | Additional_Info | Price | Date_converted | Month_converted | Year_converted | Arrival_Hour_converted | Arrival_Minutes_converted | Dep_Time_Hour_converted | Dep_Time_Minutes_converted | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | IndiGo | Banglore | New Delhi | BLR → DEL | 22:20 | 2h 50m | non-stop | No info | 3897 | 24 | 3 | 2019 | 1 | 10 | 22 | 20 |
1 | Air India | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 7h 25m | 2 stops | No info | 7662 | 1 | 5 | 2019 | 13 | 15 | 05 | 50 |
### converting to integer
df_flight_price['Dep_Time_Hour_converted']=df_flight_price['Dep_Time_Hour_converted'].astype(int)
df_flight_price['Dep_Time_Minutes_converted']=df_flight_price['Dep_Time_Minutes_converted'].astype(int)
df_flight_price.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10683 entries, 0 to 10682 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Airline 10683 non-null object 1 Source 10683 non-null object 2 Destination 10683 non-null object 3 Route 10682 non-null object 4 Dep_Time 10683 non-null object 5 Duration 10683 non-null object 6 Total_Stops 10682 non-null object 7 Additional_Info 10683 non-null object 8 Price 10683 non-null int64 9 Date_converted 10683 non-null int64 10 Month_converted 10683 non-null int64 11 Year_converted 10683 non-null int64 12 Arrival_Hour_converted 10683 non-null int64 13 Arrival_Minutes_converted 10683 non-null int64 14 Dep_Time_Hour_converted 10683 non-null int64 15 Dep_Time_Minutes_converted 10683 non-null int64 dtypes: int64(8), object(8) memory usage: 1.3+ MB
df_flight_price.drop('Dep_Time',axis=1,inplace=True)
df_flight_price.head(2)
Airline | Source | Destination | Route | Duration | Total_Stops | Additional_Info | Price | Date_converted | Month_converted | Year_converted | Arrival_Hour_converted | Arrival_Minutes_converted | Dep_Time_Hour_converted | Dep_Time_Minutes_converted | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | IndiGo | Banglore | New Delhi | BLR → DEL | 2h 50m | non-stop | No info | 3897 | 24 | 3 | 2019 | 1 | 10 | 22 | 20 |
1 | Air India | Kolkata | Banglore | CCU → IXR → BBI → BLR | 7h 25m | 2 stops | No info | 7662 | 1 | 5 | 2019 | 13 | 15 | 5 | 50 |
df_flight_price['Total_Stops'].unique()
array(['non-stop', '2 stops', '1 stop', '3 stops', nan, '4 stops'], dtype=object)
df_flight_price['Total_Stops'].isnull().sum()
np.int64(1)
df_flight_price[df_flight_price['Total_Stops'].isnull()]
Airline | Source | Destination | Route | Duration | Total_Stops | Additional_Info | Price | Date_converted | Month_converted | Year_converted | Arrival_Hour_converted | Arrival_Minutes_converted | Dep_Time_Hour_converted | Dep_Time_Minutes_converted | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
9039 | Air India | Delhi | Cochin | NaN | 23h 40m | NaN | No info | 7480 | 6 | 5 | 2019 | 9 | 25 | 9 | 45 |
## replacing the NaN value and mapping the categorical labels to numbers
df_flight_price['Total_Stops']=df_flight_price['Total_Stops'].map({'non-stop':0, '1 stop':1, '2 stops':2, '3 stops':3, '4 stops':4,np.nan:1 })
## now all the NaN values are removed
df_flight_price['Total_Stops'].isnull().sum()
np.int64(0)
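The map above doubles as imputation: the lone NaN row (the 23h 40m Delhi → Cochin flight) is assigned 1 stop. An equivalent way to write it that keeps the imputation explicit, shown for reference only since the column is already mapped:
## equivalent: map the labels, then fill the single missing value with 1
# stops_map = {'non-stop': 0, '1 stop': 1, '2 stops': 2, '3 stops': 3, '4 stops': 4}
# df_flight_price['Total_Stops'] = df_flight_price['Total_Stops'].map(stops_map).fillna(1).astype(int)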
df_flight_price.head(2)
Airline | Source | Destination | Route | Duration | Total_Stops | Additional_Info | Price | Date_converted | Month_converted | Year_converted | Arrival_Hour_converted | Arrival_Minutes_converted | Dep_Time_Hour_converted | Dep_Time_Minutes_converted | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | IndiGo | Banglore | New Delhi | BLR → DEL | 2h 50m | 0 | No info | 3897 | 24 | 3 | 2019 | 1 | 10 | 22 | 20 |
1 | Air India | Kolkata | Banglore | CCU → IXR → BBI → BLR | 7h 25m | 2 | No info | 7662 | 1 | 5 | 2019 | 13 | 15 | 5 | 50 |
### dropping "route" column since there are source and destination columns
df_flight_price.drop('Route',axis=1,inplace=True)
df_flight_price.head(2)
Airline | Source | Destination | Duration | Total_Stops | Additional_Info | Price | Date_converted | Month_converted | Year_converted | Arrival_Hour_converted | Arrival_Minutes_converted | Dep_Time_Hour_converted | Dep_Time_Minutes_converted | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | IndiGo | Banglore | New Delhi | 2h 50m | 0 | No info | 3897 | 24 | 3 | 2019 | 1 | 10 | 22 | 20 |
1 | Air India | Kolkata | Banglore | 7h 25m | 2 | No info | 7662 | 1 | 5 | 2019 | 13 | 15 | 5 | 50 |
## duration column
## first splitting on the blank space
## then splitting on the letter "h"
## then taking value [0]
df_flight_price['Duration_hour_converted'] =df_flight_price['Duration'].str.split(' ').str[0].str.split('h').str[0]
df_flight_price['Duration_hour_converted']
0 2 1 7 2 19 3 5 4 4 .. 10678 2 10679 2 10680 3 10681 2 10682 8 Name: Duration_hour_converted, Length: 10683, dtype: object
df_flight_price['Duration_Minute_converted'] =df_flight_price['Duration'].str.split(' ').str[1].str.split('h').str[0].str.split('m').str[0]
df_flight_price['Duration_Minute_converted']
#df_flight_price['Duration'].str.split(' ').str[1].str.split('h').str[0].str.split('m').str[0]
0 50 1 25 2 NaN 3 25 4 45 ... 10678 30 10679 35 10680 NaN 10681 40 10682 20 Name: Duration_Minute_converted, Length: 10683, dtype: object
df_flight_price.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10683 entries, 0 to 10682 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Airline 10683 non-null object 1 Source 10683 non-null object 2 Destination 10683 non-null object 3 Duration 10683 non-null object 4 Total_Stops 10683 non-null int64 5 Additional_Info 10683 non-null object 6 Price 10683 non-null int64 7 Date_converted 10683 non-null int64 8 Month_converted 10683 non-null int64 9 Year_converted 10683 non-null int64 10 Arrival_Hour_converted 10683 non-null int64 11 Arrival_Minutes_converted 10683 non-null int64 12 Dep_Time_Hour_converted 10683 non-null int64 13 Dep_Time_Minutes_converted 10683 non-null int64 14 Duration_hour_converted 10683 non-null object 15 Duration_Minute_converted 9651 non-null object dtypes: int64(9), object(7) memory usage: 1.3+ MB
### replacing NaN values with zero, since they represent 0 minutes
df_flight_price['Duration_Minute_converted'] =df_flight_price['Duration_Minute_converted'].replace([np.nan],0)
df_flight_price['Duration_Minute_converted']
0 50 1 25 2 0 3 25 4 45 .. 10678 30 10679 35 10680 0 10681 40 10682 20 Name: Duration_Minute_converted, Length: 10683, dtype: object
df_flight_price.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10683 entries, 0 to 10682 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Airline 10683 non-null object 1 Source 10683 non-null object 2 Destination 10683 non-null object 3 Duration 10683 non-null object 4 Total_Stops 10683 non-null int64 5 Additional_Info 10683 non-null object 6 Price 10683 non-null int64 7 Date_converted 10683 non-null int64 8 Month_converted 10683 non-null int64 9 Year_converted 10683 non-null int64 10 Arrival_Hour_converted 10683 non-null int64 11 Arrival_Minutes_converted 10683 non-null int64 12 Dep_Time_Hour_converted 10683 non-null int64 13 Dep_Time_Minutes_converted 10683 non-null int64 14 Duration_hour_converted 10683 non-null object 15 Duration_Minute_converted 10683 non-null object dtypes: int64(9), object(7) memory usage: 1.3+ MB
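A single regex extraction could replace the chained splits and handle both the '19h' and '5m' shapes in one pass; a hedged sketch (kept commented, since re-running it would change the dtypes shown below):
## alternative: pull optional hour and minute groups out of '2h 50m', '19h', '5m'
# dur = df_flight_price['Duration'].str.extract(r'(?:(\d+)h)?\s*(?:(\d+)m)?')
# df_flight_price['Duration_hour_converted'] = dur[0].fillna(0).astype(int)
# df_flight_price['Duration_Minute_converted'] = dur[1].fillna(0).astype(int)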
df_flight_price.head()
Airline | Source | Destination | Duration | Total_Stops | Additional_Info | Price | Date_converted | Month_converted | Year_converted | Arrival_Hour_converted | Arrival_Minutes_converted | Dep_Time_Hour_converted | Dep_Time_Minutes_converted | Duration_hour_converted | Duration_Minute_converted | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | IndiGo | Banglore | New Delhi | 2h 50m | 0 | No info | 3897 | 24 | 3 | 2019 | 1 | 10 | 22 | 20 | 2 | 50 |
1 | Air India | Kolkata | Banglore | 7h 25m | 2 | No info | 7662 | 1 | 5 | 2019 | 13 | 15 | 5 | 50 | 7 | 25 |
2 | Jet Airways | Delhi | Cochin | 19h | 2 | No info | 13882 | 9 | 6 | 2019 | 4 | 25 | 9 | 25 | 19 | 0 |
3 | IndiGo | Kolkata | Banglore | 5h 25m | 1 | No info | 6218 | 12 | 5 | 2019 | 23 | 30 | 18 | 5 | 5 | 25 |
4 | IndiGo | Banglore | New Delhi | 4h 45m | 1 | No info | 13302 | 1 | 3 | 2019 | 21 | 35 | 16 | 50 | 4 | 45 |
#df_flight_price['Duration_hour_converted']=df_flight_price['Duration_hour_converted'].astype(int)
df_flight_price['Duration_hour_converted'].unique()
array(['2', '7', '19', '5', '4', '15', '21', '25', '13', '12', '26', '22', '23', '20', '10', '6', '11', '8', '16', '3', '27', '1', '14', '9', '18', '17', '24', '30', '28', '29', '37', '34', '38', '35', '36', '47', '33', '32', '31', '42', '39', '5m', '41', '40'], dtype=object)
df_flight_price.shape
(10683, 16)
df_flight_price['Duration_hour_converted'].str.isnumeric().sum()
### the shape is (10683, 16) but only 10682 hour values are numeric, so one entry is not a number
np.int64(10682)
df_flight_price[~df_flight_price['Duration_hour_converted'].str.isnumeric()]
Airline | Source | Destination | Duration | Total_Stops | Additional_Info | Price | Date_converted | Month_converted | Year_converted | Arrival_Hour_converted | Arrival_Minutes_converted | Dep_Time_Hour_converted | Dep_Time_Minutes_converted | Duration_hour_converted | Duration_Minute_converted | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
6474 | Air India | Mumbai | Hyderabad | 5m | 2 | No info | 17327 | 6 | 3 | 2019 | 16 | 55 | 16 | 50 | 5m | 0 |
df_flight_price_copy=df_flight_price.copy()
df_flight_price_copy
Airline | Source | Destination | Duration | Total_Stops | Additional_Info | Price | Date_converted | Month_converted | Year_converted | Arrival_Hour_converted | Arrival_Minutes_converted | Dep_Time_Hour_converted | Dep_Time_Minutes_converted | Duration_hour_converted | Duration_Minute_converted | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | IndiGo | Banglore | New Delhi | 2h 50m | 0 | No info | 3897 | 24 | 3 | 2019 | 1 | 10 | 22 | 20 | 2 | 50 |
1 | Air India | Kolkata | Banglore | 7h 25m | 2 | No info | 7662 | 1 | 5 | 2019 | 13 | 15 | 5 | 50 | 7 | 25 |
2 | Jet Airways | Delhi | Cochin | 19h | 2 | No info | 13882 | 9 | 6 | 2019 | 4 | 25 | 9 | 25 | 19 | 0 |
3 | IndiGo | Kolkata | Banglore | 5h 25m | 1 | No info | 6218 | 12 | 5 | 2019 | 23 | 30 | 18 | 5 | 5 | 25 |
4 | IndiGo | Banglore | New Delhi | 4h 45m | 1 | No info | 13302 | 1 | 3 | 2019 | 21 | 35 | 16 | 50 | 4 | 45 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
10678 | Air Asia | Kolkata | Banglore | 2h 30m | 0 | No info | 4107 | 9 | 4 | 2019 | 22 | 25 | 19 | 55 | 2 | 30 |
10679 | Air India | Kolkata | Banglore | 2h 35m | 0 | No info | 4145 | 27 | 4 | 2019 | 23 | 20 | 20 | 45 | 2 | 35 |
10680 | Jet Airways | Banglore | Delhi | 3h | 0 | No info | 7229 | 27 | 4 | 2019 | 11 | 20 | 8 | 20 | 3 | 0 |
10681 | Vistara | Banglore | New Delhi | 2h 40m | 0 | No info | 12648 | 1 | 3 | 2019 | 14 | 10 | 11 | 30 | 2 | 40 |
10682 | Air India | Delhi | Cochin | 8h 20m | 2 | No info | 11753 | 9 | 5 | 2019 | 19 | 15 | 10 | 55 | 8 | 20 |
10683 rows × 16 columns
df_flight_price_copy=df_flight_price_copy.drop(df_flight_price_copy.index[6474])
df_flight_price_copy[~df_flight_price_copy['Duration_hour_converted'].str.isnumeric()]
Airline | Source | Destination | Duration | Total_Stops | Additional_Info | Price | Date_converted | Month_converted | Year_converted | Arrival_Hour_converted | Arrival_Minutes_converted | Dep_Time_Hour_converted | Dep_Time_Minutes_converted | Duration_hour_converted | Duration_Minute_converted |
---|
### saving the data to a new file
df_flight_price_copy.to_csv('flight_cleaned_v_01.csv')
df_flight_price['Duration_Minute_converted']=df_flight_price['Duration_Minute_converted'].astype(int)
df_flight_price.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10683 entries, 0 to 10682 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Airline 10683 non-null object 1 Source 10683 non-null object 2 Destination 10683 non-null object 3 Duration 10683 non-null object 4 Total_Stops 10683 non-null int64 5 Additional_Info 10683 non-null object 6 Price 10683 non-null int64 7 Date_converted 10683 non-null int64 8 Month_converted 10683 non-null int64 9 Year_converted 10683 non-null int64 10 Arrival_Hour_converted 10683 non-null int64 11 Arrival_Minutes_converted 10683 non-null int64 12 Dep_Time_Hour_converted 10683 non-null int64 13 Dep_Time_Minutes_converted 10683 non-null int64 14 Duration_hour_converted 10683 non-null object 15 Duration_Minute_converted 10683 non-null int64 dtypes: int64(10), object(6) memory usage: 1.3+ MB
df_flight_price['Airline'].unique()
array(['IndiGo', 'Air India', 'Jet Airways', 'SpiceJet', 'Multiple carriers', 'GoAir', 'Vistara', 'Air Asia', 'Vistara Premium economy', 'Jet Airways Business', 'Multiple carriers Premium economy', 'Trujet'], dtype=object)
df_flight_price['Destination'].unique()
array(['New Delhi', 'Banglore', 'Cochin', 'Kolkata', 'Delhi', 'Hyderabad'], dtype=object)
df_flight_price['Additional_Info'].unique()
array(['No info', 'In-flight meal not included', 'No check-in baggage included', '1 Short layover', 'No Info', '1 Long layover', 'Change airports', 'Business class', 'Red-eye flight', '2 Long layover'], dtype=object)
### use OneHotEncoder to convert the categorical variables into numerical values
from sklearn.preprocessing import OneHotEncoder
encoder=OneHotEncoder()
encoder.fit_transform(df_flight_price[['Airline','Source','Destination']]).toarray()
array([[0., 0., 0., ..., 0., 0., 1.], [0., 1., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], ..., [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 1.], [0., 1., 0., ..., 0., 0., 0.]], shape=(10683, 23))
pd.DataFrame(encoder.fit_transform(df_flight_price[['Airline','Source','Destination']]).toarray(),columns=encoder.get_feature_names_out())
Airline_Air Asia | Airline_Air India | Airline_GoAir | Airline_IndiGo | Airline_Jet Airways | Airline_Jet Airways Business | Airline_Multiple carriers | Airline_Multiple carriers Premium economy | Airline_SpiceJet | Airline_Trujet | ... | Source_Chennai | Source_Delhi | Source_Kolkata | Source_Mumbai | Destination_Banglore | Destination_Cochin | Destination_Delhi | Destination_Hyderabad | Destination_Kolkata | Destination_New Delhi | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
1 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
10678 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
10679 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
10680 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
10681 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
10682 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
10683 rows × 23 columns
### after converting the data, drop the Airline, Source and Destination columns (plus Additional_Info) and then merge the new encoded columns above into the data set, as sketched below.
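A hedged sketch of that drop-and-merge step (the encoded column names come from get_feature_names_out above; df_flight_encoded is a hypothetical name):
## merge the one-hot columns back and drop the raw categorical ones
# encoded = pd.DataFrame(
#     encoder.fit_transform(df_flight_price[['Airline','Source','Destination']]).toarray(),
#     columns=encoder.get_feature_names_out(),
#     index=df_flight_price.index)
# df_flight_encoded = pd.concat(
#     [df_flight_price.drop(['Airline','Source','Destination','Additional_Info'], axis=1), encoded],
#     axis=1)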
https://github.com/krishnaik06/playstore-Dataset/blob/main/googleplaystore.csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
df_playstore=pd.read_csv('googleplaystore.csv')
df_playstore
App | Category | Rating | Reviews | Size | Installs | Type | Price | Content Rating | Genres | Last Updated | Current Ver | Android Ver | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Photo Editor & Candy Camera & Grid & ScrapBook | ART_AND_DESIGN | 4.1 | 159 | 19M | 10,000+ | Free | 0 | Everyone | Art & Design | January 7, 2018 | 1.0.0 | 4.0.3 and up |
1 | Coloring book moana | ART_AND_DESIGN | 3.9 | 967 | 14M | 500,000+ | Free | 0 | Everyone | Art & Design;Pretend Play | January 15, 2018 | 2.0.0 | 4.0.3 and up |
2 | U Launcher Lite – FREE Live Cool Themes, Hide ... | ART_AND_DESIGN | 4.7 | 87510 | 8.7M | 5,000,000+ | Free | 0 | Everyone | Art & Design | August 1, 2018 | 1.2.4 | 4.0.3 and up |
3 | Sketch - Draw & Paint | ART_AND_DESIGN | 4.5 | 215644 | 25M | 50,000,000+ | Free | 0 | Teen | Art & Design | June 8, 2018 | Varies with device | 4.2 and up |
4 | Pixel Draw - Number Art Coloring Book | ART_AND_DESIGN | 4.3 | 967 | 2.8M | 100,000+ | Free | 0 | Everyone | Art & Design;Creativity | June 20, 2018 | 1.1 | 4.4 and up |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
10836 | Sya9a Maroc - FR | FAMILY | 4.5 | 38 | 53M | 5,000+ | Free | 0 | Everyone | Education | July 25, 2017 | 1.48 | 4.1 and up |
10837 | Fr. Mike Schmitz Audio Teachings | FAMILY | 5.0 | 4 | 3.6M | 100+ | Free | 0 | Everyone | Education | July 6, 2018 | 1.0 | 4.1 and up |
10838 | Parkinson Exercices FR | MEDICAL | NaN | 3 | 9.5M | 1,000+ | Free | 0 | Everyone | Medical | January 20, 2017 | 1.0 | 2.2 and up |
10839 | The SCP Foundation DB fr nn5n | BOOKS_AND_REFERENCE | 4.5 | 114 | Varies with device | 1,000+ | Free | 0 | Mature 17+ | Books & Reference | January 19, 2015 | Varies with device | Varies with device |
10840 | iHoroscope - 2018 Daily Horoscope & Astrology | LIFESTYLE | 4.5 | 398307 | 19M | 10,000,000+ | Free | 0 | Everyone | Lifestyle | July 25, 2018 | Varies with device | Varies with device |
10841 rows × 13 columns
df_playstore.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10841 entries, 0 to 10840 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 App 10841 non-null object 1 Category 10841 non-null object 2 Rating 9367 non-null float64 3 Reviews 10841 non-null object 4 Size 10841 non-null object 5 Installs 10841 non-null object 6 Type 10840 non-null object 7 Price 10841 non-null object 8 Content Rating 10840 non-null object 9 Genres 10841 non-null object 10 Last Updated 10841 non-null object 11 Current Ver 10833 non-null object 12 Android Ver 10838 non-null object dtypes: float64(1), object(12) memory usage: 1.1+ MB
df_playstore.shape
(10841, 13)
### summary of the numerical variables
df_playstore.describe()
Rating | |
---|---|
count | 9367.000000 |
mean | 4.193338 |
std | 0.537431 |
min | 1.000000 |
25% | 4.000000 |
50% | 4.300000 |
75% | 4.500000 |
max | 19.000000 |
df_playstore.isnull().sum()
App 0 Category 0 Rating 1474 Reviews 0 Size 0 Installs 0 Type 1 Price 0 Content Rating 1 Genres 0 Last Updated 0 Current Ver 8 Android Ver 3 dtype: int64
### insights / observations
## the dataset has missing values
df_playstore['Reviews'].unique()
array(['159', '967', '87510', ..., '603', '1195', '398307'], shape=(6002,), dtype=object)
df_playstore['Reviews'].str.isnumeric().sum()
## the dataset has 10841 rows but only 10840 numeric Reviews values
## so one row has a non-numeric Reviews entry
np.int64(10840)
df_playstore[~df_playstore['Reviews'].str.isnumeric()]
App | Category | Rating | Reviews | Size | Installs | Type | Price | Content Rating | Genres | Last Updated | Current Ver | Android Ver | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
10472 | Life Made WI-Fi Touchscreen Photo Frame | 1.9 | 19.0 | 3.0M | 1,000+ | Free | 0 | Everyone | NaN | February 11, 2018 | 1.0.19 | 4.0 and up | NaN |
df_playstore_copy=df_playstore.copy()
df_playstore_copy
App | Category | Rating | Reviews | Size | Installs | Type | Price | Content Rating | Genres | Last Updated | Current Ver | Android Ver | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Photo Editor & Candy Camera & Grid & ScrapBook | ART_AND_DESIGN | 4.1 | 159 | 19M | 10,000+ | Free | 0 | Everyone | Art & Design | January 7, 2018 | 1.0.0 | 4.0.3 and up |
1 | Coloring book moana | ART_AND_DESIGN | 3.9 | 967 | 14M | 500,000+ | Free | 0 | Everyone | Art & Design;Pretend Play | January 15, 2018 | 2.0.0 | 4.0.3 and up |
2 | U Launcher Lite – FREE Live Cool Themes, Hide ... | ART_AND_DESIGN | 4.7 | 87510 | 8.7M | 5,000,000+ | Free | 0 | Everyone | Art & Design | August 1, 2018 | 1.2.4 | 4.0.3 and up |
3 | Sketch - Draw & Paint | ART_AND_DESIGN | 4.5 | 215644 | 25M | 50,000,000+ | Free | 0 | Teen | Art & Design | June 8, 2018 | Varies with device | 4.2 and up |
4 | Pixel Draw - Number Art Coloring Book | ART_AND_DESIGN | 4.3 | 967 | 2.8M | 100,000+ | Free | 0 | Everyone | Art & Design;Creativity | June 20, 2018 | 1.1 | 4.4 and up |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
10836 | Sya9a Maroc - FR | FAMILY | 4.5 | 38 | 53M | 5,000+ | Free | 0 | Everyone | Education | July 25, 2017 | 1.48 | 4.1 and up |
10837 | Fr. Mike Schmitz Audio Teachings | FAMILY | 5.0 | 4 | 3.6M | 100+ | Free | 0 | Everyone | Education | July 6, 2018 | 1.0 | 4.1 and up |
10838 | Parkinson Exercices FR | MEDICAL | NaN | 3 | 9.5M | 1,000+ | Free | 0 | Everyone | Medical | January 20, 2017 | 1.0 | 2.2 and up |
10839 | The SCP Foundation DB fr nn5n | BOOKS_AND_REFERENCE | 4.5 | 114 | Varies with device | 1,000+ | Free | 0 | Mature 17+ | Books & Reference | January 19, 2015 | Varies with device | Varies with device |
10840 | iHoroscope - 2018 Daily Horoscope & Astrology | LIFESTYLE | 4.5 | 398307 | 19M | 10,000,000+ | Free | 0 | Everyone | Lifestyle | July 25, 2018 | Varies with device | Varies with device |
10841 rows × 13 columns
df_playstore_copy=df_playstore_copy.drop(df_playstore_copy.index[10472])
df_playstore_copy
App | Category | Rating | Reviews | Size | Installs | Type | Price | Content Rating | Genres | Last Updated | Current Ver | Android Ver | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Photo Editor & Candy Camera & Grid & ScrapBook | ART_AND_DESIGN | 4.1 | 159 | 19M | 10,000+ | Free | 0 | Everyone | Art & Design | January 7, 2018 | 1.0.0 | 4.0.3 and up |
1 | Coloring book moana | ART_AND_DESIGN | 3.9 | 967 | 14M | 500,000+ | Free | 0 | Everyone | Art & Design;Pretend Play | January 15, 2018 | 2.0.0 | 4.0.3 and up |
2 | U Launcher Lite – FREE Live Cool Themes, Hide ... | ART_AND_DESIGN | 4.7 | 87510 | 8.7M | 5,000,000+ | Free | 0 | Everyone | Art & Design | August 1, 2018 | 1.2.4 | 4.0.3 and up |
3 | Sketch - Draw & Paint | ART_AND_DESIGN | 4.5 | 215644 | 25M | 50,000,000+ | Free | 0 | Teen | Art & Design | June 8, 2018 | Varies with device | 4.2 and up |
4 | Pixel Draw - Number Art Coloring Book | ART_AND_DESIGN | 4.3 | 967 | 2.8M | 100,000+ | Free | 0 | Everyone | Art & Design;Creativity | June 20, 2018 | 1.1 | 4.4 and up |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
10836 | Sya9a Maroc - FR | FAMILY | 4.5 | 38 | 53M | 5,000+ | Free | 0 | Everyone | Education | July 25, 2017 | 1.48 | 4.1 and up |
10837 | Fr. Mike Schmitz Audio Teachings | FAMILY | 5.0 | 4 | 3.6M | 100+ | Free | 0 | Everyone | Education | July 6, 2018 | 1.0 | 4.1 and up |
10838 | Parkinson Exercices FR | MEDICAL | NaN | 3 | 9.5M | 1,000+ | Free | 0 | Everyone | Medical | January 20, 2017 | 1.0 | 2.2 and up |
10839 | The SCP Foundation DB fr nn5n | BOOKS_AND_REFERENCE | 4.5 | 114 | Varies with device | 1,000+ | Free | 0 | Mature 17+ | Books & Reference | January 19, 2015 | Varies with device | Varies with device |
10840 | iHoroscope - 2018 Daily Horoscope & Astrology | LIFESTYLE | 4.5 | 398307 | 19M | 10,000,000+ | Free | 0 | Everyone | Lifestyle | July 25, 2018 | Varies with device | Varies with device |
10840 rows × 13 columns
df_playstore_copy[~df_playstore_copy['Reviews'].str.isnumeric()]
App | Category | Rating | Reviews | Size | Installs | Type | Price | Content Rating | Genres | Last Updated | Current Ver | Android Ver |
---|
#### convert the Reviews column into int
df_playstore_copy['Reviews']=df_playstore_copy['Reviews'].astype(int)
df_playstore_copy.info()
<class 'pandas.core.frame.DataFrame'> Index: 10840 entries, 0 to 10840 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 App 10840 non-null object 1 Category 10840 non-null object 2 Rating 9366 non-null float64 3 Reviews 10840 non-null int64 4 Size 10840 non-null object 5 Installs 10840 non-null object 6 Type 10839 non-null object 7 Price 10840 non-null object 8 Content Rating 10840 non-null object 9 Genres 10840 non-null object 10 Last Updated 10840 non-null object 11 Current Ver 10832 non-null object 12 Android Ver 10838 non-null object dtypes: float64(1), int64(1), object(11) memory usage: 1.2+ MB
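An alternative to hunting for the bad row by hand is pd.to_numeric with errors='coerce', which turns any non-numeric entry into NaN so it can be inspected or dropped; a sketch:
## coerce non-numeric Reviews entries to NaN instead of raising
# reviews_num = pd.to_numeric(df_playstore['Reviews'], errors='coerce')
# df_playstore[reviews_num.isnull()]  # surfaces the shifted row 10472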
df_playstore_copy['Size'].unique()
array(['19M', '14M', '8.7M', '25M', '2.8M', '5.6M', '29M', '33M', '3.1M', '28M', '12M', '20M', '21M', '37M', '2.7M', '5.5M', '17M', '39M', '31M', '4.2M', '7.0M', '23M', '6.0M', '6.1M', '4.6M', '9.2M', '5.2M', '11M', '24M', 'Varies with device', '9.4M', '15M', '10M', '1.2M', '26M', '8.0M', '7.9M', '56M', '57M', '35M', '54M', '201k', '3.6M', '5.7M', '8.6M', '2.4M', '27M', '2.5M', '16M', '3.4M', '8.9M', '3.9M', '2.9M', '38M', '32M', '5.4M', '18M', '1.1M', '2.2M', '4.5M', '9.8M', '52M', '9.0M', '6.7M', '30M', '2.6M', '7.1M', '3.7M', '22M', '7.4M', '6.4M', '3.2M', '8.2M', '9.9M', '4.9M', '9.5M', '5.0M', '5.9M', '13M', '73M', '6.8M', '3.5M', '4.0M', '2.3M', '7.2M', '2.1M', '42M', '7.3M', '9.1M', '55M', '23k', '6.5M', '1.5M', '7.5M', '51M', '41M', '48M', '8.5M', '46M', '8.3M', '4.3M', '4.7M', '3.3M', '40M', '7.8M', '8.8M', '6.6M', '5.1M', '61M', '66M', '79k', '8.4M', '118k', '44M', '695k', '1.6M', '6.2M', '18k', '53M', '1.4M', '3.0M', '5.8M', '3.8M', '9.6M', '45M', '63M', '49M', '77M', '4.4M', '4.8M', '70M', '6.9M', '9.3M', '10.0M', '8.1M', '36M', '84M', '97M', '2.0M', '1.9M', '1.8M', '5.3M', '47M', '556k', '526k', '76M', '7.6M', '59M', '9.7M', '78M', '72M', '43M', '7.7M', '6.3M', '334k', '34M', '93M', '65M', '79M', '100M', '58M', '50M', '68M', '64M', '67M', '60M', '94M', '232k', '99M', '624k', '95M', '8.5k', '41k', '292k', '11k', '80M', '1.7M', '74M', '62M', '69M', '75M', '98M', '85M', '82M', '96M', '87M', '71M', '86M', '91M', '81M', '92M', '83M', '88M', '704k', '862k', '899k', '378k', '266k', '375k', '1.3M', '975k', '980k', '4.1M', '89M', '696k', '544k', '525k', '920k', '779k', '853k', '720k', '713k', '772k', '318k', '58k', '241k', '196k', '857k', '51k', '953k', '865k', '251k', '930k', '540k', '313k', '746k', '203k', '26k', '314k', '239k', '371k', '220k', '730k', '756k', '91k', '293k', '17k', '74k', '14k', '317k', '78k', '924k', '902k', '818k', '81k', '939k', '169k', '45k', '475k', '965k', '90M', '545k', '61k', '283k', '655k', '714k', '93k', '872k', '121k', '322k', '1.0M', '976k', '172k', '238k', '549k', '206k', '954k', '444k', '717k', '210k', '609k', '308k', '705k', '306k', '904k', '473k', '175k', '350k', '383k', '454k', '421k', '70k', '812k', '442k', '842k', '417k', '412k', '459k', '478k', '335k', '782k', '721k', '430k', '429k', '192k', '200k', '460k', '728k', '496k', '816k', '414k', '506k', '887k', '613k', '243k', '569k', '778k', '683k', '592k', '319k', '186k', '840k', '647k', '191k', '373k', '437k', '598k', '716k', '585k', '982k', '222k', '219k', '55k', '948k', '323k', '691k', '511k', '951k', '963k', '25k', '554k', '351k', '27k', '82k', '208k', '913k', '514k', '551k', '29k', '103k', '898k', '743k', '116k', '153k', '209k', '353k', '499k', '173k', '597k', '809k', '122k', '411k', '400k', '801k', '787k', '237k', '50k', '643k', '986k', '97k', '516k', '837k', '780k', '961k', '269k', '20k', '498k', '600k', '749k', '642k', '881k', '72k', '656k', '601k', '221k', '228k', '108k', '940k', '176k', '33k', '663k', '34k', '942k', '259k', '164k', '458k', '245k', '629k', '28k', '288k', '775k', '785k', '636k', '916k', '994k', '309k', '485k', '914k', '903k', '608k', '500k', '54k', '562k', '847k', '957k', '688k', '811k', '270k', '48k', '329k', '523k', '921k', '874k', '981k', '784k', '280k', '24k', '518k', '754k', '892k', '154k', '860k', '364k', '387k', '626k', '161k', '879k', '39k', '970k', '170k', '141k', '160k', '144k', '143k', '190k', '376k', '193k', '246k', '73k', '658k', '992k', '253k', '420k', '404k', '470k', '226k', '240k', '89k', '234k', '257k', '861k', '467k', '157k', '44k', '676k', '67k', '552k', 
'885k', '1020k', '582k', '619k'], dtype=object)
df_playstore_copy['Size'].isnull().sum()
np.int64(0)
## converting M into k units - 19M = 19000k
df_playstore_copy['Size']=df_playstore_copy['Size'].str.replace('M','000')
### removing the unit suffix since all values are now in k
df_playstore_copy['Size']=df_playstore_copy['Size'].str.replace('K','')
#### the uppercase 'K' above didn't replace anything because the data uses lowercase 'k'; replacing the lowercase character works
df_playstore_copy['Size']=df_playstore_copy['Size'].str.replace('k','')
df_playstore_copy['Size'].unique()
array(['19000', '14000', '8.7000', '25000', '2.8000', '5.6000', '29000', '33000', '3.1000', '28000', '12000', '20000', '21000', '37000', '2.7000', '5.5000', '17000', '39000', '31000', '4.2000', '7.0000', '23000', '6.0000', '6.1000', '4.6000', '9.2000', '5.2000', '11000', '24000', 'Varies with device', '9.4000', '15000', '10000', '1.2000', '26000', '8.0000', '7.9000', '56000', '57000', '35000', '54000', '201', '3.6000', '5.7000', '8.6000', '2.4000', '27000', '2.5000', '16000', '3.4000', '8.9000', '3.9000', '2.9000', '38000', '32000', '5.4000', '18000', '1.1000', '2.2000', '4.5000', '9.8000', '52000', '9.0000', '6.7000', '30000', '2.6000', '7.1000', '3.7000', '22000', '7.4000', '6.4000', '3.2000', '8.2000', '9.9000', '4.9000', '9.5000', '5.0000', '5.9000', '13000', '73000', '6.8000', '3.5000', '4.0000', '2.3000', '7.2000', '2.1000', '42000', '7.3000', '9.1000', '55000', '23', '6.5000', '1.5000', '7.5000', '51000', '41000', '48000', '8.5000', '46000', '8.3000', '4.3000', '4.7000', '3.3000', '40000', '7.8000', '8.8000', '6.6000', '5.1000', '61000', '66000', '79', '8.4000', '118', '44000', '695', '1.6000', '6.2000', '18', '53000', '1.4000', '3.0000', '5.8000', '3.8000', '9.6000', '45000', '63000', '49000', '77000', '4.4000', '4.8000', '70000', '6.9000', '9.3000', '10.0000', '8.1000', '36000', '84000', '97000', '2.0000', '1.9000', '1.8000', '5.3000', '47000', '556', '526', '76000', '7.6000', '59000', '9.7000', '78000', '72000', '43000', '7.7000', '6.3000', '334', '34000', '93000', '65000', '79000', '100000', '58000', '50000', '68000', '64000', '67000', '60000', '94000', '232', '99000', '624', '95000', '8.5', '41', '292', '11', '80000', '1.7000', '74000', '62000', '69000', '75000', '98000', '85000', '82000', '96000', '87000', '71000', '86000', '91000', '81000', '92000', '83000', '88000', '704', '862', '899', '378', '266', '375', '1.3000', '975', '980', '4.1000', '89000', '696', '544', '525', '920', '779', '853', '720', '713', '772', '318', '58', '241', '196', '857', '51', '953', '865', '251', '930', '540', '313', '746', '203', '26', '314', '239', '371', '220', '730', '756', '91', '293', '17', '74', '14', '317', '78', '924', '902', '818', '81', '939', '169', '45', '475', '965', '90000', '545', '61', '283', '655', '714', '93', '872', '121', '322', '1.0000', '976', '172', '238', '549', '206', '954', '444', '717', '210', '609', '308', '705', '306', '904', '473', '175', '350', '383', '454', '421', '70', '812', '442', '842', '417', '412', '459', '478', '335', '782', '721', '430', '429', '192', '200', '460', '728', '496', '816', '414', '506', '887', '613', '243', '569', '778', '683', '592', '319', '186', '840', '647', '191', '373', '437', '598', '716', '585', '982', '222', '219', '55', '948', '323', '691', '511', '951', '963', '25', '554', '351', '27', '82', '208', '913', '514', '551', '29', '103', '898', '743', '116', '153', '209', '353', '499', '173', '597', '809', '122', '411', '400', '801', '787', '237', '50', '643', '986', '97', '516', '837', '780', '961', '269', '20', '498', '600', '749', '642', '881', '72', '656', '601', '221', '228', '108', '940', '176', '33', '663', '34', '942', '259', '164', '458', '245', '629', '28', '288', '775', '785', '636', '916', '994', '309', '485', '914', '903', '608', '500', '54', '562', '847', '957', '688', '811', '270', '48', '329', '523', '921', '874', '981', '784', '280', '24', '518', '754', '892', '154', '860', '364', '387', '626', '161', '879', '39', '970', '170', '141', '160', '144', '143', '190', '376', '193', '246', '73', '658', '992', '253', '420', '404', 
'470', '226', '240', '89', '234', '257', '861', '467', '157', '44', '676', '67', '552', '885', '1020', '582', '619'], dtype=object)
df_playstore_copy.info()
<class 'pandas.core.frame.DataFrame'> Index: 10840 entries, 0 to 10840 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 App 10840 non-null object 1 Category 10840 non-null object 2 Rating 9366 non-null float64 3 Reviews 10840 non-null int64 4 Size 10840 non-null object 5 Installs 10840 non-null object 6 Type 10839 non-null object 7 Price 10840 non-null object 8 Content Rating 10840 non-null object 9 Genres 10840 non-null object 10 Last Updated 10840 non-null object 11 Current Ver 10832 non-null object 12 Android Ver 10838 non-null object dtypes: float64(1), int64(1), object(11) memory usage: 1.2+ MB
### 'Varies with device' is a pure string, so replace it with NaN
df_playstore_copy['Size']=df_playstore_copy['Size'].replace('Varies with device',np.nan)
df_playstore_copy['Size']=df_playstore_copy['Size'].astype(float)
df_playstore_copy['Size']
0 19000.0 1 14000.0 2 8.7 3 25000.0 4 2.8 ... 10836 53000.0 10837 3.6 10838 9.5 10839 NaN 10840 19000.0 Name: Size, Length: 10840, dtype: float64
df_playstore_copy.info()
<class 'pandas.core.frame.DataFrame'> Index: 10840 entries, 0 to 10840 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 App 10840 non-null object 1 Category 10840 non-null object 2 Rating 9366 non-null float64 3 Reviews 10840 non-null int64 4 Size 9145 non-null float64 5 Installs 10840 non-null object 6 Type 10839 non-null object 7 Price 10840 non-null object 8 Content Rating 10840 non-null object 9 Genres 10840 non-null object 10 Last Updated 10840 non-null object 11 Current Ver 10832 non-null object 12 Android Ver 10838 non-null object dtypes: float64(2), int64(1), object(10) memory usage: 1.2+ MB
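One caveat with the replace('M','000') trick: a decimal size such as '8.7M' becomes '8.7000', which parses as 8.7 rather than 8700 (visible in the Size values above). A hedged sketch that converts both units numerically instead, applied to the raw column before any string replacements (size_to_k is a hypothetical helper):
## convert '8.7M' / '201k' style sizes into a single k-unit float
# def size_to_k(value):
#     if value.endswith('M'):
#         return float(value[:-1]) * 1000   # 1M = 1000k
#     if value.endswith('k'):
#         return float(value[:-1])
#     return np.nan                          # 'Varies with device' and other oddities
# df_playstore['Size'].apply(size_to_k)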
df_playstore_copy['Installs'].unique()
array(['10,000+', '500,000+', '5,000,000+', '50,000,000+', '100,000+', '50,000+', '1,000,000+', '10,000,000+', '5,000+', '100,000,000+', '1,000,000,000+', '1,000+', '500,000,000+', '50+', '100+', '500+', '10+', '1+', '5+', '0+', '0'], dtype=object)
df_playstore_copy['Price'].unique()
array(['0', '$4.99', '$3.99', '$6.99', '$1.49', '$2.99', '$7.99', '$5.99', '$3.49', '$1.99', '$9.99', '$7.49', '$0.99', '$9.00', '$5.49', '$10.00', '$24.99', '$11.99', '$79.99', '$16.99', '$14.99', '$1.00', '$29.99', '$12.99', '$2.49', '$10.99', '$1.50', '$19.99', '$15.99', '$33.99', '$74.99', '$39.99', '$3.95', '$4.49', '$1.70', '$8.99', '$2.00', '$3.88', '$25.99', '$399.99', '$17.99', '$400.00', '$3.02', '$1.76', '$4.84', '$4.77', '$1.61', '$2.50', '$1.59', '$6.49', '$1.29', '$5.00', '$13.99', '$299.99', '$379.99', '$37.99', '$18.99', '$389.99', '$19.90', '$8.49', '$1.75', '$14.00', '$4.85', '$46.99', '$109.99', '$154.99', '$3.08', '$2.59', '$4.80', '$1.96', '$19.40', '$3.90', '$4.59', '$15.46', '$3.04', '$4.29', '$2.60', '$3.28', '$4.60', '$28.99', '$2.95', '$2.90', '$1.97', '$200.00', '$89.99', '$2.56', '$30.99', '$3.61', '$394.99', '$1.26', '$1.20', '$1.04'], dtype=object)
df_playstore_copy.info()
<class 'pandas.core.frame.DataFrame'> Index: 10840 entries, 0 to 10840 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 App 10840 non-null object 1 Category 10840 non-null object 2 Rating 9366 non-null float64 3 Reviews 10840 non-null int64 4 Size 9145 non-null float64 5 Installs 10840 non-null object 6 Type 10839 non-null object 7 Price 10840 non-null object 8 Content Rating 10840 non-null object 9 Genres 10840 non-null object 10 Last Updated 10840 non-null object 11 Current Ver 10832 non-null object 12 Android Ver 10838 non-null object dtypes: float64(2), int64(1), object(10) memory usage: 1.2+ MB
chars_to_remove=['+',',','$']
cols_to_clean=['Installs','Price']
for item in chars_to_remove:
    for cols in cols_to_clean:
        df_playstore_copy[cols]=df_playstore_copy[cols].str.replace(item,'')
df_playstore_copy['Price'].unique()
array(['0', '4.99', '3.99', '6.99', '1.49', '2.99', '7.99', '5.99', '3.49', '1.99', '9.99', '7.49', '0.99', '9.00', '5.49', '10.00', '24.99', '11.99', '79.99', '16.99', '14.99', '1.00', '29.99', '12.99', '2.49', '10.99', '1.50', '19.99', '15.99', '33.99', '74.99', '39.99', '3.95', '4.49', '1.70', '8.99', '2.00', '3.88', '25.99', '399.99', '17.99', '400.00', '3.02', '1.76', '4.84', '4.77', '1.61', '2.50', '1.59', '6.49', '1.29', '5.00', '13.99', '299.99', '379.99', '37.99', '18.99', '389.99', '19.90', '8.49', '1.75', '14.00', '4.85', '46.99', '109.99', '154.99', '3.08', '2.59', '4.80', '1.96', '19.40', '3.90', '4.59', '15.46', '3.04', '4.29', '2.60', '3.28', '4.60', '28.99', '2.95', '2.90', '1.97', '200.00', '89.99', '2.56', '30.99', '3.61', '394.99', '1.26', '1.20', '1.04'], dtype=object)
df_playstore_copy['Installs'].unique()
array(['10000', '500000', '5000000', '50000000', '100000', '50000', '1000000', '10000000', '5000', '100000000', '1000000000', '1000', '500000000', '50', '100', '500', '10', '1', '5', '0'], dtype=object)
df_playstore_copy['Installs']=df_playstore_copy['Installs'].astype('int')
df_playstore_copy['Price']=df_playstore_copy['Price'].astype('float')
df_playstore_copy.info()
<class 'pandas.core.frame.DataFrame'> Index: 10840 entries, 0 to 10840 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 App 10840 non-null object 1 Category 10840 non-null object 2 Rating 9366 non-null float64 3 Reviews 10840 non-null int64 4 Size 9145 non-null float64 5 Installs 10840 non-null int64 6 Type 10839 non-null object 7 Price 10840 non-null float64 8 Content Rating 10840 non-null object 9 Genres 10840 non-null object 10 Last Updated 10840 non-null object 11 Current Ver 10832 non-null object 12 Android Ver 10838 non-null object dtypes: float64(3), int64(2), object(8) memory usage: 1.2+ MB
### handling the Last Updated column
df_playstore_copy['Last Updated'].unique()
array(['January 7, 2018', 'January 15, 2018', 'August 1, 2018', ..., 'January 20, 2014', 'February 16, 2014', 'March 23, 2014'], shape=(1377,), dtype=object)
df_playstore_copy['Last Updated'] = pd.to_datetime(df_playstore_copy['Last Updated'])
## split the parsed date into separate day, month and year columns
df_playstore_copy['Day_Last_Updated'] = df_playstore_copy['Last Updated'].dt.day
df_playstore_copy['Month_Last_Updated'] = df_playstore_copy['Last Updated'].dt.month
df_playstore_copy['Year_Last_Updated'] = df_playstore_copy['Last Updated'].dt.year
df_playstore_copy.info()
<class 'pandas.core.frame.DataFrame'>
Index: 10840 entries, 0 to 10840
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   App                 10840 non-null  object
 1   Category            10840 non-null  object
 2   Rating              9366 non-null   float64
 3   Reviews             10840 non-null  int64
 4   Size                9145 non-null   float64
 5   Installs            10840 non-null  int64
 6   Type                10839 non-null  object
 7   Price               10840 non-null  float64
 8   Content Rating      10840 non-null  object
 9   Genres              10840 non-null  object
 10  Last Updated        10840 non-null  datetime64[ns]
 11  Current Ver         10832 non-null  object
 12  Android Ver         10838 non-null  object
 13  Day_Last_Updated    10840 non-null  int32
 14  Month_Last_Updated  10840 non-null  int32
 15  Year_Last_Updated   10840 non-null  int32
dtypes: datetime64[ns](1), float64(3), int32(3), int64(2), object(7)
memory usage: 1.3+ MB
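Since every date follows the same 'Month day, year' pattern, the parse can also be pinned to an explicit format, which is faster and flags malformed dates instead of guessing. A sketch:
# '%B %d, %Y' matches strings like 'January 7, 2018';
# unparseable values become NaT rather than mis-parsed dates
df_playstore_copy['Last Updated'] = pd.to_datetime(
    df_playstore_copy['Last Updated'], format='%B %d, %Y', errors='coerce')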
df_playstore_copy.head(2)
App | Category | Rating | Reviews | Size | Installs | Type | Price | Content Rating | Genres | Last Updated | Current Ver | Android Ver | Day_Last_Updated | Month_Last_Updated | Year_Last_Updated | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Photo Editor & Candy Camera & Grid & ScrapBook | ART_AND_DESIGN | 4.1 | 159 | 19000.0 | 10000 | Free | 0.0 | Everyone | Art & Design | 2018-01-07 | 1.0.0 | 4.0.3 and up | 7 | 1 | 2018 |
1 | Coloring book moana | ART_AND_DESIGN | 3.9 | 967 | 14000.0 | 500000 | Free | 0.0 | Everyone | Art & Design;Pretend Play | 2018-01-15 | 2.0.0 | 4.0.3 and up | 15 | 1 | 2018 |
### saving the cleaned data to a new file
# df_playstore_copy.to_csv('google_cleaned_v_01.csv')
df_playstore_copy[df_playstore_copy.duplicated('App')].shape
(1181, 16)
There are 1181 duplicate App entries, so we drop them and keep only the first occurrence of each app.
df_playstore_copy = df_playstore_copy.drop_duplicates(subset=['App'], keep='first')
df_playstore_copy[df_playstore_copy.duplicated('App')].shape
(0, 16)
df_playstore_copy.shape
(9659, 16)
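keep='first' keeps whichever duplicate row happens to appear first. When the duplicates differ (for example, snapshots taken at different times with different review counts), keeping the most-reviewed row is often more useful. A sketch of that variant:
# keep the duplicate with the highest review count instead of the first one seen
df_playstore_copy = (df_playstore_copy
                     .sort_values('Reviews', ascending=False)
                     .drop_duplicates(subset=['App'], keep='first')
                     .sort_index())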
#### Exploring the data
## split the columns into numeric and categorical based on dtype ('O' = object)
numeric_features = [feature for feature in df_playstore_copy.columns if df_playstore_copy[feature].dtype != 'O']
categorical_features = [feature for feature in df_playstore_copy.columns if df_playstore_copy[feature].dtype == 'O']
## print columns
print('We have {} numerical features: {}'.format(len(numeric_features), numeric_features))
print('\nWe have {} categorical features: {}'.format(len(categorical_features), categorical_features))
We have 9 numerical features: ['Rating', 'Reviews', 'Size', 'Installs', 'Price', 'Last Updated', 'Day_Last_Updated', 'Month_Last_Updated', 'Year_Last_Updated']

We have 7 categorical features: ['App', 'Category', 'Type', 'Content Rating', 'Genres', 'Current Ver', 'Android Ver']
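The same split can be written with pandas' select_dtypes, which avoids comparing dtypes by hand. A sketch; note that datetime columns such as 'Last Updated' also land on the non-object side, matching the list comprehensions above:
numeric_features = df_playstore_copy.select_dtypes(exclude='object').columns.tolist()
categorical_features = df_playstore_copy.select_dtypes(include='object').columns.tolist()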
### what is the proportion of count data for every categorical column
for col in categorical_features:
    print(df_playstore_copy[col].value_counts(normalize=True) * 100)  # normalize=True gives proportions; *100 turns them into percentages
    print('------------------------')
App
iHoroscope - 2018 Daily Horoscope & Astrology         0.010353
Photo Editor & Candy Camera & Grid & ScrapBook        0.010353
Coloring book moana                                   0.010353
U Launcher Lite – FREE Live Cool Themes, Hide Apps    0.010353
Sketch - Draw & Paint                                 0.010353
                                                        ...
Learn To Draw Kawaii Characters                       0.010353
3D Color Pixel by Number - Sandbox Art Coloring       0.010353
Mandala Coloring Book                                 0.010353
Tattoo Name On My Photo Editor                        0.010353
Name Art Photo Editor - Focus n Filters               0.010353
Name: proportion, Length: 9659, dtype: float64
------------------------
Category
FAMILY                 18.966767
GAME                    9.928564
TOOLS                   8.561963
BUSINESS                4.348276
MEDICAL                 4.089450
PERSONALIZATION         3.892743
PRODUCTIVITY            3.872036
LIFESTYLE               3.820271
FINANCE                 3.571798
SPORTS                  3.364738
COMMUNICATION           3.261207
HEALTH_AND_FITNESS      2.981675
PHOTOGRAPHY             2.909204
NEWS_AND_MAGAZINES      2.629672
SOCIAL                  2.474376
BOOKS_AND_REFERENCE     2.298375
TRAVEL_AND_LOCAL        2.267315
SHOPPING                2.091314
DATING                  1.770370
VIDEO_PLAYERS           1.687545
MAPS_AND_NAVIGATION     1.356248
EDUCATION               1.232012
FOOD_AND_DRINK          1.159540
ENTERTAINMENT           1.056010
AUTO_AND_VEHICLES       0.880008
LIBRARIES_AND_DEMO      0.869655
WEATHER                 0.817890
HOUSE_AND_HOME          0.766125
ART_AND_DESIGN          0.662594
EVENTS                  0.662594
PARENTING               0.621182
COMICS                  0.579770
BEAUTY                  0.548711
Name: proportion, dtype: float64
------------------------
Type
Free    92.172292
Paid     7.827708
Name: proportion, dtype: float64
------------------------
Content Rating
Everyone           81.820064
Teen               10.725748
Mature 17+          4.068744
Everyone 10+        3.333678
Adults only 18+     0.031059
Unrated             0.020706
Name: proportion, dtype: float64
------------------------
Genres
Tools            8.551610
Entertainment    5.808055
Education        5.280050
Business         4.348276
Medical          4.089450
                   ...
Role Playing;Brain Games    0.010353
Strategy;Education          0.010353
Racing;Pretend Play         0.010353
Communication;Creativity    0.010353
Strategy;Creativity         0.010353
Name: proportion, Length: 118, dtype: float64
------------------------
Current Ver
Varies with device    10.931510
1.0                    8.278935
1.1                    2.694021
1.2                    1.823645
2.0                    1.543881
                        ...
0.7.1                  0.010362
4.6.71                 0.010362
2.0.148.0              0.010362
1.0.0.96               0.010362
1.022                  0.010362
Name: proportion, Length: 2817, dtype: float64
------------------------
Android Ver
4.1 and up            22.802112
4.0.3 and up          14.445480
4.0 and up            13.306410
Varies with device    10.251631
4.4 and up             8.470540
2.3 and up             6.378793
5.0 and up             5.301854
4.2 and up             3.852128
2.3.3 and up           2.826965
2.2 and up             2.474889
3.0 and up             2.392047
4.3 and up             2.288495
2.1 and up             1.377239
1.6 and up             1.201201
6.0 and up             0.559180
7.0 and up             0.434918
3.2 and up             0.372787
2.0 and up             0.331366
5.1 and up             0.227814
1.5 and up             0.207104
4.4W and up            0.113907
3.1 and up             0.103552
2.0.1 and up           0.072486
8.0 and up             0.062131
7.1 and up             0.031066
5.0 - 8.0              0.020710
4.0.3 - 7.1.1          0.020710
1.0 and up             0.020710
7.0 - 7.1.1            0.010355
4.1 - 7.1.1            0.010355
5.0 - 6.0              0.010355
2.2 - 7.1.1            0.010355
5.0 - 7.1.1            0.010355
Name: proportion, dtype: float64
------------------------
### distribution of the numerical columns
plt.figure(figsize=(15, 15))   # plt.figure (lowercase) registers the figure; plt.Figure only creates an object
plt.suptitle('Univariate Analysis of numerical features', fontsize=20, fontweight='bold', alpha=0.8, y=1.)
for i in range(0, len(numeric_features)):
    plt.subplot(5, 3, i + 1)
    sns.kdeplot(x=df_playstore_copy[numeric_features[i]], fill=True, color='r')  # fill replaces the deprecated shade argument
    plt.xlabel(numeric_features[i])
plt.tight_layout()
## Observations from the above plot
## Rating and Year are left-skewed, while Reviews, Size, Installs and Price are right-skewed
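The skewness read off the KDE plots can be checked numerically: pandas' skew() returns positive values for right-skewed columns and negative values for left-skewed ones. A quick sketch:
# positive skew = long right tail, negative skew = long left tail
print(df_playstore_copy[['Rating', 'Reviews', 'Size', 'Installs',
                         'Price', 'Year_Last_Updated']].skew())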
# categorical variables
plt.figure(figsize=(20, 15))
plt.suptitle('Univariate Analysis of categorical features', fontsize=20, fontweight='bold', alpha=0.8, y=1.)
### these are column names from the dataset
category = ['Type', 'Content Rating']
for i in range(0, len(category)):
    plt.subplot(2, 2, i + 1)
    ## countplot shows how many apps fall into each value of the column
    sns.countplot(x=df_playstore_copy[category[i]], palette="Set2")
    plt.xlabel(category[i])
    plt.xticks(rotation=45)
plt.tight_layout()
## which is the most popular app category
df_playstore_copy.head(2)
App | Category | Rating | Reviews | Size | Installs | Type | Price | Content Rating | Genres | Last Updated | Current Ver | Android Ver | Day_Last_Updated | Month_Last_Updated | Year_Last_Updated | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Photo Editor & Candy Camera & Grid & ScrapBook | ART_AND_DESIGN | 4.1 | 159 | 19000.0 | 10000 | Free | 0.0 | Everyone | Art & Design | 2018-01-07 | 1.0.0 | 4.0.3 and up | 7 | 1 | 2018 |
1 | Coloring book moana | ART_AND_DESIGN | 3.9 | 967 | 14000.0 | 500000 | Free | 0.0 | Everyone | Art & Design;Pretend Play | 2018-01-15 | 2.0.0 | 4.0.3 and up | 15 | 1 | 2018 |
df_playstore_copy['Category'].value_counts().plot.pie(figsize=(15, 16), autopct='%1.1f%%')  # plot.pie on a Series needs no y argument
<Axes: ylabel='count'>
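With 33 categories, the small pie slices are hard to compare; a horizontal bar chart of the same counts is usually easier to read. A sketch using the same data:
# same counts as the pie chart, drawn as a horizontal bar chart
df_playstore_copy['Category'].value_counts().sort_values().plot.barh(figsize=(8, 10))
plt.xlabel('Number of apps')
plt.show()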
## Observations
'''
1. Most of the apps in the Play Store fall under the Family, Games and Tools categories.
2. Beauty, Comics, Arts and Weather apps are comparatively rare in the Play Store.
'''
## Top ten app categories
category = pd.DataFrame(df_playstore_copy['Category'].value_counts())
category.rename(columns={'Category': 'count'}, inplace=True)  # older pandas names the column 'Category'; recent versions already call it 'count'
category
count | |
---|---|
Category | |
FAMILY | 1832 |
GAME | 959 |
TOOLS | 827 |
BUSINESS | 420 |
MEDICAL | 395 |
PERSONALIZATION | 376 |
PRODUCTIVITY | 374 |
LIFESTYLE | 369 |
FINANCE | 345 |
SPORTS | 325 |
COMMUNICATION | 315 |
HEALTH_AND_FITNESS | 288 |
PHOTOGRAPHY | 281 |
NEWS_AND_MAGAZINES | 254 |
SOCIAL | 239 |
BOOKS_AND_REFERENCE | 222 |
TRAVEL_AND_LOCAL | 219 |
SHOPPING | 202 |
DATING | 171 |
VIDEO_PLAYERS | 163 |
MAPS_AND_NAVIGATION | 131 |
EDUCATION | 119 |
FOOD_AND_DRINK | 112 |
ENTERTAINMENT | 102 |
AUTO_AND_VEHICLES | 85 |
LIBRARIES_AND_DEMO | 84 |
WEATHER | 79 |
HOUSE_AND_HOME | 74 |
ART_AND_DESIGN | 64 |
EVENTS | 64 |
PARENTING | 60 |
COMICS | 56 |
BEAUTY | 53 |
### Top 10 app categories
plt.figure(figsize=(15, 6))
sns.barplot(x=category.index[:10], y='count', data=category[:10], palette='hls')
plt.xticks(rotation=90)
plt.show()
## Observations
'''
1. The Family category has the most apps, with about 19% of all apps, followed by the Games category with about 10%.
2. The Beauty and Comics categories have the fewest apps, each holding less than 1% of the total.
'''
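The percentages quoted above can be verified straight from value_counts; a quick sketch:
pct = df_playstore_copy['Category'].value_counts(normalize=True) * 100
print(pct.head(2))  # FAMILY ~19.0%, GAME ~9.9%
print(pct.tail(2))  # COMICS ~0.58%, BEAUTY ~0.55%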