For each state, find the percentage of female and of male respondents.
# plyr supplies ddply() and summarise() for the per-state aggregation below.
library(plyr)

# Cleaned BRFSS 2009 survey extract, one row per respondent.
brfss <- read.csv("http://www.hofroe.net/stat579/brfss%2009/brfss-clean.csv")

# For every X_STATE value, the share of rows with SEX == 1 (pctmale) and with
# SEX == 2 (pctfemale). Values are fractions of the state's rows, not 0-100.
# NOTE(review): 1 = male / 2 = female is implied by the column names only;
# confirm against the BRFSS codebook.
pct.states <- ddply(
  brfss,
  "X_STATE",
  summarise,
  pctmale = sum(SEX == 1) / length(SEX),
  pctfemale = sum(SEX == 2) / length(SEX)
)
The file fips-code.csv has a list of state names by FIPS code, which is the coding used in the variable X_STATE in the BRFSS data. Use this data and the states data from the maps
package to draw a map of the U.S. (mainland only) with states colored by the percentage of male respondents.
# Draw the mainland U.S. with each state filled by its share of male
# respondents (`pctmale` in `pct.states`).
library(maps)
library(ggplot2)

# Lookup table: FIPS code -> state name.
fips <- read.csv("http://www.hofroe.net/stat579/brfss%2009/fips-code.csv")
head(fips)

# Build a join key that matches the lower-case `region` names produced by
# map_data("state"). trimws() guards against stray blanks around the names.
fips$region <- tolower(trimws(fips$State.Name))

states <- map_data("state")

# Both merges are inner joins, so a state whose name or FIPS code has no
# exact counterpart on the other side is dropped and will be missing from
# the map.
# NOTE(review): if states are still missing, compare
# setdiff(unique(states$region), fips$region) to find the mismatched names.
statesfips <- merge(states, fips, by = "region")
statesbrfss <- merge(
  statesfips, pct.states,
  by.x = "FIPS.Code", by.y = "X_STATE"
)

# merge() reorders rows, but polygons are drawn in row order. Restore the
# vertex sequence within each polygon before plotting; otherwise the state
# outlines come out scrambled.
statesbrfss <- statesbrfss[order(statesbrfss$group, statesbrfss$order), ]

ggplot(statesbrfss, aes(x = long, y = lat, group = group, fill = pctmale)) +
  geom_polygon()
Write a function `palindrome5(x)` that is `TRUE` if `x` is a palindrome of length 5 (i.e. `x` is five letters long and reads the same from the back and from the front, e.g. 'radar', 'level'). Extra points if you can avoid an explicit loop or recursion.
#' Test whether strings are palindromes of exactly five characters.
#'
#' Vectorised over `x`: every element is checked independently, without an
#' explicit loop or recursion.
#'
#' @param x A character vector (other atomic vectors are coerced with
#'   `as.character()`).
#' @return A logical vector the same length as `x`. An element is `TRUE` only
#'   if it has exactly five characters and reads the same in both directions;
#'   missing values yield `FALSE`.
palindrome5 <- function(x) {
  x <- as.character(x)
  # Rebuild each string as chars 1-2-3-2-1: a five-character string is a
  # palindrome exactly when this mirror image equals the string itself.
  mirrored <- sub("^(.)(.)(.).*$", "\\1\\2\\3\\2\\1", x)
  # `&` (not `if`) keeps the test elementwise; the leading !is.na() turns
  # NA inputs into FALSE instead of NA.
  !is.na(x) & nchar(x) == 5 & x == mirrored
}

palindrome5("radar")
palindrome5("radon")
The file course-description.txt contains a description of all the courses offered by the Statistics Department at ISU. Read this file and convert it to a data frame with columns 'Course', 'Name', 'Description'. Extra points if you successfully extract the number of credits from the description as well.
# Read the course catalogue, one course per line, into a one-column data
# frame. The lines are free text, so quote and comment processing are turned
# off: with read.table()'s defaults an apostrophe, a double quote or a '#'
# inside a description would merge or truncate records.
file <- read.table(
  "http://www.hofroe.net/stat579/course-description.txt",
  sep = "\n",
  quote = "",
  comment.char = "",
  stringsAsFactors = FALSE
)

# Break every record into pieces at each ". " (period followed by a blank).
fsplit <- strsplit(as.character(file[, 1]), split = "\\. ")

# Pieces 1-3 become Course, Name and Credit; everything from piece 6 onwards
# is glued back together as the Description.
# NOTE(review): pieces 4 and 5 are discarded here - confirm that is intended
# and that piece 3 really holds the credit information.
dframe <- ldply(fsplit, function(x) {
  c(x[1], x[2], x[3], paste(x[-(1:5)], collapse = ". "))
})
names(dframe) <- c("Course", "Name", "Credit", "Description")
summary(dframe)
For the following gene sequence:
sequence <- "ATGGATTCTGGTATGTTCTAGCGCTTGCACCATCCCATTTAACTGTAAGAAGAATTGCACGGTCCCAATTGCTCGAGAGA TTTCTCTTTTACCTTTTTTTACTATTTTTCACTCTCCCATAACCTCCTATATTGACTGATCTGTAATAACCACGATATTA TTGGAATAAATAGGGGCTTGAAATTTGGAAAAAAAAAAAAACTGAAATATTTTCGTGATAAGTGATAGTGATATTCTTCT TTTATTTGCTACTGTTACTAAGTCTCATGTACTAACATCGATTGCTTCATTCTTTTTGTTGCTATATTATATGTTTAGAG GTTGCTGCTTTGGTTATTGATAACGGTTCTGGTATGTGTAAAGCCGGTTTTGCCGGTGACGACGCTCCTCGTGCTGTCTT CCCATCTATCGTCGGTAGACCAAGACACCAAGGTATCATGGTCGGTATGGGTCAAAAAGACTCCTACGTTGGTGATGAAG CTCAATCCAAGAGAGGTATCTTGACTTTACGTTACCCAATTGAACACGGTATTGTCACCAACTGGGACGATATGGAAAAG ATCTGGCATCATACCTTCTACAACGAATTGAGAGTTGCCCCAGAAGAACACCCTGTTCTTTTGACTGAAGCTCCAATGAA CCCTAAATCAAACAGAGAAAAGATGACTCAAATTATGTTTGAAACTTTCAACGTTCCAGCCTTCTACGTTTCCATCCAAG CCGTTTTGTCCTTGTACTCTTCCGGTAGAACTACTGGTATTGTTTTGGATTCCGGTGATGGTGTTACTCACGTCGTTCCA ATTTACGCTGGTTTCTCTCTACCTCACGCCATTTTGAGAATCGATTTGGCCGGTAGAGATTTGACTGACTACTTGATGAA GATCTTGAGTGAACGTGGTTACTCTTTCTCCACCACTGCTGAAAGAGAAATTGTCCGTGACATCAAGGAAAAACTATGTT ACGTCGCCTTGGACTTCGAACAAGAAATGCAAACCGCTGCTCAATCTTCTTCAATTGAAAAATCCTACGAACTTCCAGAT GGTCAAGTCATCACTATTGGTAACGAAAGATTCAGAGCCCCAGAAGCTTTGTTCCATCCTTCTGTTTTGGGTTTGGAATC TGCCGGTATTGACCAAACTACTTACAACTCCATCATGAAGTGTGATGTCGATGTCCGTAAGGAATTATACGGTAACATCG TTATGTCCGGTGGTACCACCATGTTCCCAGGTATTGCCGAAAGAATGCAAAAGGAAATCACCGCTTTGGCTCCATCTTCC ATGAAGGTCAAGATCATTGCTCCTCCAGAAAGAAAGTACTCCGTCTGGATTGGTGGTTCTATCTTGGCTTCTTTGACTAC CTTCCAACAAATGTGGATCTCAAAACAAGAATACGACGAAAGTGGTCCATCTATCGTTCACCACAAGTGTTTCTAA"find all 3 letter 'words' (a word is a sequence of the letters 'A', 'G', 'C', and 'T') and the frequency of their occurence
#' Leading three-character word of a string.
#'
#' @param x A single string.
#' @return The first three characters of `x`; `x` itself, unchanged, when it
#'   has three characters or fewer.
next3 <- function(x) {
  if (nchar(x) > 3) {
    substr(x, 1, 3)
  } else {
    x
  }
}
# Tabulate every overlapping 3-letter word in the gene sequence.

# Keep only the base letters. The literal also contains layout whitespace
# (blanks / line breaks) that would otherwise end up inside "words".
dna <- gsub("[^ACGT]", "", sequence)
n_bases <- nchar(dna)

# A string of n letters holds n - 2 overlapping triplets, starting at
# positions 1 .. n - 2. substring() extracts them all in one vectorised call.
if (n_bases >= 3) {
  starts <- seq_len(n_bases - 2)
  res <- substring(dna, starts, starts + 2)
} else {
  res <- character(0)
}

# Word frequencies, least to most common.
sort(table(res))
Write a function `integral(n, a, b, f)` that estimates the area under the function `f` between the limits `a` and `b` using `n` pairs of uniform random numbers. The picture below shows a description of the process. In this example, 1000 pairs of random uniform values in [0.5, 3.5] x [0, 15] were used. 592 of them fell below the curve, giving an estimate for the area under the curve of 0.592 * 3 * 15 = 26.64.
#' Hit-or-miss Monte Carlo estimate of the area under a curve.
#'
#' Draws `n` points uniformly from the box [a, b] x [0, ymax] and scales the
#' fraction of points lying below the curve by the area of the box.
#'
#' @param n Number of random points to draw.
#' @param a,b Lower and upper integration limits.
#' @param f Vectorised function of one numeric argument. It must satisfy
#'   0 <= f(x) <= ymax on [a, b] for the estimate to be unbiased.
#' @param ymax Height of the sampling box; defaults to 15.
#' @return A single number: the estimated area under `f` between `a` and `b`.
integral <- function(n, a, b, f, ymax = 15) {
  x <- runif(n, a, b)
  y <- runif(n, 0, ymax)
  fx <- f(x)

  # Parts of the curve outside the box cannot be sampled correctly.
  if (any(fx < 0 | fx > ymax, na.rm = TRUE)) {
    warning(
      "f(x) falls outside [0, ymax] for some sampled x; estimate is biased",
      call. = FALSE
    )
  }

  # A point is a hit when it lies BELOW the curve, i.e. y < f(x).
  mean(y < fx) * (b - a) * ymax
}
Apply your function repeatedly (say, 20 times) to the curve f(x) = 2x(x-3)^2+5 between limits 1 and 4 using 2000 pairs of random numbers. Give an estimate for the area under the curve and a standard deviation.
# Curve to integrate: f(x) = 2x(x - 3)^2 + 5.
f <- function(x) {
  2 * x * (x - 3)^2 + 5
}

# Twenty independent runs of integral(), each with 2000 random pairs on [1, 4].
res <- vapply(
  seq_len(20),
  function(i) integral(2000, 1, 4, f),
  numeric(1)
)

# Average of the runs as the area estimate, and their spread.
mean(res)
sd(res)