0

I am trying to index in a large data frame. The data frame sdata has 3 million observations and 26 variables (a sample of its structure is at the bottom).

# Number of locations generated around the centre point.
setsize <- 6
# Distance from the centre point to each location.
eccent <- 150
# Centre point that the locations are rotated about.
ctrX <- 400
ctrY <- 300

# Rotate the point (x, y) about the centre (ctrX, ctrY) by `angle` degrees
# using the standard 2-D rotation formula.
#
# All arithmetic is element-wise, so vector arguments (e.g. several angles)
# produce vectors of rotated coordinates.
#
# Returns a list with elements "X" and "Y" holding the rotated coordinates.
xyrotate <- function(x, y, ctrX, ctrY, angle) {
  # Trigonometric functions expect radians, not degrees.
  theta <- angle * (pi / 180)
  # Offsets of the point relative to the rotation centre.
  dx <- x - ctrX
  dy <- y - ctrY
  list(
    "X" = ctrX + (dx * cos(theta)) - (dy * sin(theta)),
    "Y" = ctrY + (dx * sin(theta)) + (dy * cos(theta))
  )
}

# One row per location; the coordinates are filled in below.
loc <- data.frame(x = numeric(setsize),
              y = numeric(setsize))
# The first location sits at (ctrX, ctrY - eccent).
loc$x[1] <- ctrX
loc$y[1] <- ctrY - eccent
# Remaining locations: rotate the first one about the centre in equal
# steps of 360/setsize degrees.
for(i in 2:setsize){
  coord <- xyrotate(loc$x[1], loc$y[1],ctrX,ctrY,(i-1)*(360/setsize)) 
  loc$x[i] <- coord$X
  loc$y[i] <- coord$Y
}
# gazedist[r, d] is the Euclidean distance from gaze sample r to location d.
gazedist <- matrix(nrow=nrow(sdata), ncol = setsize)
for(d in 1:setsize){
  x <- sdata$RIGHT_GAZE_X-loc$x[d]
  y <- sdata$RIGHT_GAZE_Y-loc$y[d]
  gazedist[,d] <- sqrt(x^2+y^2)
}
sdata$gdist_T <- 0
# NOTE: a vector in the column position selects one whole column per element,
# so this requests an nrow(sdata) x nrow(sdata) matrix rather than one value
# per row.
sdata$gdist_T <- gazedist[ ,sdata$t_targLoc]

The last line here causes R to crash. Is there some way to plug the values of sdata$t_targLoc[i] into the d value of gazedist[i,d]? The for loop equivalent would be:

# For each gaze sample (row i), pick the distance in the column named by that
# sample's target location. Iterate over rows: length(gazedist) would count
# every cell (rows * columns) and index past the last row.
for (i in seq_len(nrow(gazedist))) {
  sdata$gdist_T[i] <- gazedist[i, sdata$t_targLoc[i]]
}

But that will be slow in R...

sdata structure

structure(list(RIGHT_GAZE_X = c(409.5, 409.6, 409.5, 409.4, 409.3, 
409.2, 409.1, 409, 408.9, 408.8), RIGHT_GAZE_Y = c(291.9, 291.5, 
290.9, 290.3, 290.3, 290.3, 289.8, 289.2, 288.7, 288.8), RECORDING_SESSION_LABEL = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "ET101", class = "factor"), 
    t_block = c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), t_trialNum = c(129L, 
    129L, 129L, 129L, 129L, 129L, 129L, 129L, 129L, 129L), t_subjNum = c(101L, 
    101L, 101L, 101L, 101L, 101L, 101L, 101L, 101L, 101L), t_colCond = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "green", class = "factor"), 
    t_targLoc = c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), t_targID = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "L", class = "factor"), 
    t_targShape = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L), .Label = "diamond", class = "factor"), t_singLoc = c(5L, 
    5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), t_singPres = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "P", class = "factor"), 
    t_singDist = c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), t_singAngle = c(120L, 
    120L, 120L, 120L, 120L, 120L, 120L, 120L, 120L, 120L), t_targAngle = c(120L, 
    120L, 120L, 120L, 120L, 120L, 120L, 120L, 120L, 120L), t_RESP = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "L", class = "factor"), 
    t_ACC = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), t_RT = c(686.1062, 
    686.1062, 686.1062, 686.1062, 686.1062, 686.1062, 686.1062, 
    686.1062, 686.1062, 686.1062), TRIAL_START_TIME = c(1031031L, 
    1031031L, 1031031L, 1031031L, 1031031L, 1031031L, 1031031L, 
    1031031L, 1031031L, 1031031L), TIMESTAMP = c(1031030, 1031032, 
    1031034, 1031036, 1031038, 1031040, 1031042, 1031044, 1031046, 
    1031048), IP_START_TIME = c(1031031L, 1031031L, 1031031L, 
    1031031L, 1031031L, 1031031L, 1031031L, 1031031L, 1031031L, 
    1031031L), currtime = c(0, 2, 4, 6, 8, 10, 12, 14, 16, 18
    ), currsamp = c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), gdist_T = c(0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0), gdist_S = c(0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0), gdist_NS = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0)), .Names = c("RIGHT_GAZE_X", 
"RIGHT_GAZE_Y", "RECORDING_SESSION_LABEL", "t_block", "t_trialNum", 
"t_subjNum", "t_colCond", "t_targLoc", "t_targID", "t_targShape", 
"t_singLoc", "t_singPres", "t_singDist", "t_singAngle", "t_targAngle", 
"t_RESP", "t_ACC", "t_RT", "TRIAL_START_TIME", "TIMESTAMP", "IP_START_TIME", 
"currtime", "currsamp", "gdist_T", "gdist_S", "gdist_NS"), row.names = 53170:53179, class = "data.frame")
2
  • 1
    You realize that you don't have to upload all the rows, just perhaps the first 10? Commented Sep 30, 2015 at 23:05
  • 1
    "The last line here causes R to crash." -> with what error message? The line in itself is ok, completely equivalent to the loop (so both work or fail). Commented Sep 30, 2015 at 23:34

3 Answers 3

2

It seems like you want to grab the ith row and the sdata$t_targLoc[i] column of gazedist. There is a built-in for that. Use:

# A two-column (row, column) index matrix extracts one cell per row of the
# index; seq_len() keeps the row index empty when gazedist has no rows.
sdata$gdist_T <- gazedist[cbind(seq_len(nrow(gazedist)), sdata$t_targLoc)]

Here's an example:

# 5 x 5 demo matrix, filled column by column.
m <- matrix(1:25, ncol = 5)
m
#      [,1] [,2] [,3] [,4] [,5]
# [1,]    1    6   11   16   21
# [2,]    2    7   12   17   22
# [3,]    3    8   13   18   23
# [4,]    4    9   14   19   24
# [5,]    5   10   15   20   25
# Column to pick for each of rows 1..5.
v <- c(1, 3, 5, 2, 4)
# Indexing with a two-column matrix returns m[row, col] for each index row.
m[cbind(1:5, v)]
# [1]  1 12 23  9 20

BTW, your first for loop can be replaced by:

# Rotate the first location through the remaining setsize - 1 angles in a
# single vectorised call, then prepend the first location itself. Columns are
# named x and y so that code reading loc$x / loc$y keeps working.
rotated <- xyrotate(ctrX, ctrY - eccent, ctrX, ctrY,
                    seq_len(setsize - 1) * 360 / setsize)
loc <- data.frame(x = c(ctrX, rotated$X),
                  y = c(ctrY - eccent, rotated$Y))

Your second for loop can be replaced by, e.g.,

# Euclidean distance from every gaze sample in sdata to the point (x, y).
f <- function(x, y) {
  dx <- sdata$RIGHT_GAZE_X - x
  dy <- sdata$RIGHT_GAZE_Y - y
  sqrt(dx^2 + dy^2)
}
# One call of f per location; mapply binds the results into columns.
ggazedist <- mapply(f, loc$x, loc$y)
identical(gazedist, ggazedist)
# [1] TRUE
Sign up to request clarification or add additional context in comments.

Comments

1

Yeah, I think this works. Essentially you want to get a vector out of a matrix, where the row indices are 1:nrow(sdata) and the column indices are sdata$t_targLoc. This isn't built-in (that I know of), but we can convert the matrix to a vector and grab the right values.

# Flatten the matrix; R stores matrices column by column, so cell (r, c)
# ends up at position (c - 1) * nrow + r of the vector.
gazedist_vals <- as.vector(gazedist)
rows <- seq_len(nrow(sdata))
cols <- sdata$t_targLoc
indices <- (cols - 1) * nrow(gazedist) + rows
sdata$gdist_T <- gazedist_vals[indices]

I think that will do what you want.

Your data excerpt doesn't illustrate this very well since t_targLoc is always 3. Here's a little illustration:

# 2 x 3 demo matrix, filled column by column.
x <- matrix(c(5, 2, 65, 8, 4, 2), nrow = 2)
x
#      [,1] [,2] [,3]
# [1,]    5   65    4
# [2,]    2    8    2
as.vector(x)
# [1]  5  2 65  8  4  2
# Cells to extract: (1, 3), (1, 2) and (2, 1).
rows <- c(1, 1, 2)
cols <- c(3, 2, 1)
# Position of cell (r, c) in the flattened, column-major vector.
inds <- (cols - 1) * nrow(x) + rows
as.vector(x)[inds]
# [1]  4 65  2

One comment: your question is thorough, but minimal examples are often preferable. You gave us 26 columns of data, of which only a few were needed. You gave us functions and code to calculate distances, when you could have just given a distance matrix (and then only one data column would be necessary). A small example like my x matrix and rows and cols vectors might have been all you needed to show the problem.

Comments

0

You can, but I doubt you'll save much time unless you figure out a way to completely vectorize. In other words, you would have to avoid using functionals like apply or sapply, which are all based on for loops in C and hence not that much faster than normal for loops, if at all.

# Returns 1 for values that are multiples of 100 between 0 and 50000, else 0.
someFunction <- function(x) ifelse(x %in% seq(0, 50000, 100), 1, 0)

# NOTE(review): `diamonds` is not defined in this snippet; presumably the
# ggplot2 example dataset -- confirm it is loaded before running.

# Here you have "vectorized" the indexing
system.time(sapply(seq_len(nrow(diamonds)), someFunction))
# 2.6 elapsed secs

## vs here where you're just using a for loop

# k must exist before k[i] can be assigned, so preallocate it up front.
k <- numeric(nrow(diamonds))
system.time(
for (i in seq_len(nrow(diamonds))) {
  k[i] <- someFunction(i)
}
)
# 2.7 elapsed secs

Comments

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.