The following code works and will be put into cron on an AWS server (not sure of its specifications, cores, kernels, etc.) at a one-minute frequency.
However, it's my first time putting a script into production, so I'd like some help reviewing this code. Please look it over and point to any hidden mistakes or something that's risky to implement or is just bad design. Additionally, it currently runs within a minute because I've limited the design to optimize 100 rows of data at a time.
Could this be made faster and allow me to optimize even more rows? Hypothetically, I can have row count in the 10s of 1000s, but since I sort them according to my cost, they become less valuable down below- but still if it can be done, it'll help after a while when the product scales.
Stack: MySQL is where the main data is stored and fetched from. Redis is the database where changes need to go. This script will be placed in a 1-minute cron on a separate server. All are on independent AWS servers. I haven't really applied any Redis-based optimization while building this, and I'm not sure how much of a difference it could make.
import MySQLdb
import pandas as pd
import datetime
import redis
from decimal import Decimal, DecimalException
import math
import cPickle as pickle
import re
from pandas.util.testing import assert_frame_equal
# --- MySQL connection setup ---------------------------------------------
# NOTE(review): credentials are hard-coded (root user, plaintext password in
# the script) -- move them to a config file or environment variables before
# this goes to production.
conn = MySQLdb.connect(host="AWS_HOST", user="root", passwd="pasword",db="db_name", local_infile=1)
# DictCursor so fetchall() yields dicts that pandas can turn straight into a DataFrame.
cursor = conn.cursor(MySQLdb.cursors.DictCursor)
cursor.execute("SET NAMES utf8")
# Shift the clock forward 330 minutes (UTC+5:30) -- presumably converting the
# server's UTC time to IST; TODO confirm the server clock is actually UTC.
now = datetime.datetime.now()+datetime.timedelta(minutes=330)
# Round `now` down to the most recent half-hour boundary (:00 or :30),
# which is the bucket key (`dh`) used by the read query below.
subtract = now.minute if now.minute < 30 else now.minute-30
now-=datetime.timedelta(minutes=subtract)
# Pull this half-hour bucket's aggregates per (group, url), keeping only
# loss-making rows (pft < 0), most expensive first.
# Fixes vs. original:
#  * `group` is a reserved word in MySQL -- backtick-quote it;
#  * pass the timestamp as a bound parameter instead of str.format(), so the
#    query is not assembled by string interpolation.
read_query = """
select `group`, url, sum(acost)/1000000 cost, sum(rpm-acost)/1000000 pft,
sum(lg) imps, sum(acost)/1000000/sum(lg)*1000 cpm, sum(rpm)/1000000/sum(lg)*1000 rpm,
sum(rpm-acost)/1000000/sum(lg)*1000 ppm
from db_name.db
where dh = %s
group by `group`, url having pft < 0
order by sum(acost) desc;"""
cursor.execute(read_query, (now.strftime('%Y-%m-%d %H:%M:00'),))
cost_data_new = pd.DataFrame(list(cursor.fetchall()))
# Previous run's snapshot. NOTE(review): the very first run (or a deleted
# file) will crash here -- seed an empty pickle before enabling the cron.
cost_data_old = pickle.load(open("cost_data_old.p", "rb"))
## manage structure
r = redis.Redis(host='AWS2_HOST', port=PORT, db=0)
# Did the data change since the last run?
# BUG FIX: assert_frame_equal() returns None on success and *raises* on
# mismatch, so the original `if assert_frame_equal(...): changed=False`
# could never set changed to False -- the "unchanged" branch below was dead
# code. Set the flag after the call succeeds instead.
try:
    assert_frame_equal(cost_data_new,
                       cost_data_old[['group', 'url', 'cost', 'pft', 'imps', 'cpm', 'rpm', 'ppm']],
                       check_dtype=False)
    changed = False
except Exception:
    # Frames differ (AssertionError) or the old snapshot is malformed
    # (e.g. missing columns) -- treat both as "changed".
    changed = True
def rename_x(col_name):
    """Strip the pandas merge suffix '_x' from a column name.

    After the outer merge in get_diff(), the "new" columns carry an '_x'
    suffix ('cost_x', 'pft_x', ...); this restores their original names.
    Any other name passes through unchanged.  A plain endswith() check
    replaces the original regex -- same result, no `re` machinery.
    """
    if col_name.endswith('_x'):
        return col_name[:-2]
    return col_name
def cal_rpm(rw):
    """Revenue-per-mille of the interval delta: (pft_diff + cost_diff) / imps_diff * 1000.

    Returns 0 when there were no new impressions (division by zero).
    BUG FIX: the original had `except Exception as e: raise e` *before* the
    "proceeding with 0" fallback, which made that fallback unreachable and
    re-raised ZeroDivisionError for non-Decimal rows.
    """
    try:
        return (rw['pft_diff'] + rw['cost_diff']) / rw['imps_diff'] * 1000
    except (DecimalException, ZeroDivisionError):
        # imps_diff == 0 -- no traffic this interval, proceed with 0.
        return 0
def cal_cpm(rw):
    """Cost-per-mille of the interval delta: cost_diff / imps_diff * 1000.

    Returns 0 when there were no new impressions (division by zero).
    BUG FIX: same as cal_rpm -- the original re-raised every non-Decimal
    exception, leaving the "proceeding with 0" fallback unreachable.
    """
    try:
        return rw['cost_diff'] / rw['imps_diff'] * 1000
    except (DecimalException, ZeroDivisionError):
        # imps_diff == 0 -- no traffic this interval, proceed with 0.
        return 0
def cal_ppm(rw):
    """Profit-per-mille of the interval delta: pft_diff / imps_diff * 1000.

    Returns 0 when there were no new impressions (division by zero).
    BUG FIX: same as cal_rpm/cal_cpm -- the original re-raised every
    non-Decimal exception, so its `return 0` fallback was dead code.
    """
    try:
        return rw['pft_diff'] / rw['imps_diff'] * 1000
    except (DecimalException, ZeroDivisionError):
        # imps_diff == 0 -- no traffic this interval, proceed with 0.
        return 0
def get_diff(new, old):
    """Outer-merge the new and old snapshots on (group, url) and compute deltas.

    Rows present in only one frame get their missing side filled with 0, so
    *_diff columns represent activity during the last interval only.  Adds
    pft_diff / cost_diff / imps_diff plus per-mille rates (diff_rpm /
    diff_cpm / diff_ppm), renames the '_x' (new) columns back to their bare
    names, and drops the '_y' (old) columns.
    """
    diff = pd.merge(new, old, how='outer', on=['group', 'url'])
    diff.fillna(0, inplace=True)
    diff['pft_diff'] = diff['pft_x'] - diff['pft_y']
    diff['cost_diff'] = diff['cost_x'] - diff['cost_y']
    diff['imps_diff'] = diff['imps_x'] - diff['imps_y']
    diff['diff_rpm'] = diff[['group', 'url', 'pft_diff', 'cost_diff', 'imps_diff']].apply(cal_rpm, axis=1)
    diff['diff_cpm'] = diff[['group', 'url', 'cost_diff', 'imps_diff']].apply(cal_cpm, axis=1)
    diff['diff_ppm'] = diff[['group', 'url', 'pft_diff', 'imps_diff']].apply(cal_ppm, axis=1)
    diff = diff.rename(columns=rename_x)
    diff.drop(list(diff.filter(regex='_y')), axis=1, inplace=True)
    # The old snapshot may carry an 'optimized' flag from a previous run;
    # drop it explicitly instead of the original bare try/except, which
    # would also have hidden unrelated errors.
    if 'optimized' in diff.columns:
        del diff['optimized']
    return diff
def calc_bid_prob(lgpm):
    """Map an impression count onto a bid probability percentage.

    Generalised logistic decay: floor + (ceiling - floor) / (1 + (lgpm/infl)^slope),
    scaled to an integer percentage.  More impressions -> lower probability,
    with the curve's midpoint at lgpm == 13.
    """
    floor = 0.01
    ceiling = 1
    inflection = 13
    steepness = 1
    decay = math.pow(lgpm / inflection, steepness)
    fraction = floor + (ceiling - floor) / (1 + decay)
    return int(100 * fraction)
def optimize_val(rw):
    """Adjust the Redis bid and bid-probability for one (group, url) row.

    Returns 1 when the row was processed, 0 when an error occurred.
    Rules, applied to the interval deltas:
      * deeply unprofitable (ppm < -1): cut the bid (30% off rpm when rpm is
        healthy, 50% off otherwise) and lower the bid probability;
      * mildly unprofitable (-1 <= ppm < 0): lower only the probability when
        rpm is healthy, otherwise also cut the bid 30%;
      * profitable (ppm >= 0): leave the row alone.
    """
    ppm = rw['diff_ppm']
    lg = rw['imps_diff']
    rpm = rw['diff_rpm']
    bid_key = "plcmnt:{}".format(rw['group'])
    prob_key = "url:prob:{}".format(rw['group'])
    url = rw['url']
    try:
        bid = int(r.hget(bid_key, url))
    except (TypeError, ValueError):
        bid = 20  # field missing (None) or malformed -> default bid
    # BUG FIX: hget returns a string (or None); the original fed it straight
    # into min() against calc_bid_prob()'s int, so the stored probability
    # never actually capped anything. Coerce it like the bid.
    try:
        b_prob = int(r.hget(prob_key, url))
    except (TypeError, ValueError):
        b_prob = 100  # no stored probability -> no cap on the computed one
    try:
        if ppm < -1:
            discount = 0.3 if rpm >= 2 else 0.5
            new_bid = min((1 - discount) * rpm, bid)
            new_bid_prob = min(calc_bid_prob(lg), b_prob)
            r.hset(bid_key, url, new_bid)
            # BUG FIX: the rpm >= 2 branch originally stored new_bid in the
            # probability hash instead of new_bid_prob.
            r.hset(prob_key, url, new_bid_prob)
        elif ppm < 0:
            new_bid_prob = min(calc_bid_prob(lg), b_prob)
            if rpm < 2:
                new_bid = min((1 - 0.3) * rpm, bid)
                r.hset(bid_key, url, new_bid)
            r.hset(prob_key, url, new_bid_prob)
        # ppm >= 0: profitable, nothing to do
        return 1
    except Exception:
        # TODO(review): log the exception -- silently returning 0 hides failures
        return 0
## if not changed, proceed with old, optimize unoptimized top 100 and store as new pickle
# BUG FIX: DataFrame.head() (and boolean-filter + head) returns a *copy*, so
# the original `optimize['optimized'] = optimize.apply(...)` never wrote the
# flags back into the frame that gets pickled -- every run would re-optimize
# the same rows. Write through .loc on the parent frame instead.
if changed:
    # Fresh data: diff against the previous snapshot, flag everything as
    # un-optimized, and work on the 100 rows with the biggest cost delta.
    cost_data_diff = get_diff(cost_data_new,
                              cost_data_old[['group', 'url', 'cost', 'pft', 'imps', 'cpm', 'rpm', 'ppm']])
    cost_data_diff['optimized'] = 0
    cost_data_diff.sort_values(by=['cost_diff'], ascending=False, inplace=True)
    snapshot = cost_data_diff
    top = snapshot.head(100)
else:
    # Same data as last run: continue down the not-yet-optimized rows.
    cost_data_old.sort_values(by=['cost'], ascending=False, inplace=True)
    snapshot = cost_data_old
    top = snapshot[snapshot['optimized'] == 0].head(100)
snapshot.loc[top.index, 'optimized'] = top.apply(optimize_val, axis=1)
# Close the file handle deterministically (the original leaked it), and let
# the script fall off the end -- the trailing `import sys; sys.exit()` was
# redundant in a cron job.
with open("cost_data_old.p", "wb") as fh:
    pickle.dump(snapshot, fh)
cost_data_old_example:
group acost_x cpm_x imps_x pft_x ppm_x rpm_x \
0 6841 0.0002 0.12150000 2 -0.0002 -0.12150000 0E-8
1 6891 0.0002 0.19900000 1 -0.0002 -0.19900000 0E-8
2 7174 0.0001 0.14900000 1 -0.0001 -0.14900000 0E-8
3 6732 0.0001 0.14600000 1 -0.0001 -0.14600000 0E-8
4 6882 0.0001 0.13500000 1 -0.0001 -0.13500000 0E-8
5 6856 0.0001 0.10700000 1 -0.0001 -0.10700000 0E-8
6 6838 0.0001 0.08700000 1 -0.0001 -0.08700000 0E-8
7 6838 0.0001 0.08600000 1 -0.0001 -0.08600000 0E-8
8 6838 0.0001 0.08600000 1 -0.0001 -0.08600000 0E-8
url cost_y pft_y imps_y \
0 url1.org 0.0002 -0.0002 2
1 url2.com 0.0002 -0.0002 1
2 url3.com 0.0001 -0.0001 1
3 url4.tv 0.0001 -0.0001 1
4 url5.com 0.0001 -0.0001 1
5 url6.com 0.0001 -0.0001 1
6 url7.com 0.0001 -0.0001 1
7 url8.com 0.0001 -0.0001 1
8 url9.com 0.0001 -0.0001 1
cpm_y rpm_y ppm_y pft_diff cost_diff imps_diff
0 0.12150000 0E-8 -0.12150000 0.0000 0.0000 0
1 0.19900000 0E-8 -0.19900000 0.0000 0.0000 0
2 0.14900000 0E-8 -0.14900000 0.0000 0.0000 0
3 0.14600000 0E-8 -0.14600000 0.0000 0.0000 0
4 0.13500000 0E-8 -0.13500000 0.0000 0.0000 0
5 0.10700000 0E-8 -0.10700000 0.0000 0.0000 0
6 0.08700000 0E-8 -0.08700000 0.0000 0.0000 0
7 0.08600000 0E-8 -0.08600000 0.0000 0.0000 0
8 0.08600000 0E-8 -0.08600000 0.0000 0.0000 0
cost_data_new_shape:
(1587, 8)