Summary row with gtsummary

I am trying to create a table of events with gtsummary, and I would like a final row that totals the events counted in the previous rows. add_overall() and add_n() do add totals, but as a column: they count each event across groups rather than summing all the events together.
I created this example.
library(gtsummary)
library(forcats) # for as_factor()

x1 <- sample(c("No", "Yes"), 30, replace = TRUE, prob = c(0.85, 0.15))
x2 <- sample(c("No", "Yes"), 30, replace = TRUE, prob = c(0.9, 0.1))
x3 <- sample(c("No", "Yes"), 30, replace = TRUE, prob = c(0.75, 0.25))
y <- sample(c("A", "B"), 30, replace = TRUE, prob = c(0.5, 0.5))
df <- data.frame(as_factor(x1), as_factor(x2), as_factor(x3), as_factor(y))
colnames(df) <- c("event_1", "event_2", "event_3", "group")
tbl_summary(df, by = group, statistic = all_categorical() ~ "{n}")
I tried the summary_rows() function from the gt package after converting the table to a gt object, but summarising throws an error because these variables are factors.
Any other ideas?

You can do this by adding a new variable to your data frame that is the row sum of the event columns. Then you can display that variable's sum in the summary table. Example below!
library(gtsummary)
#> #Uighur
library(tidyverse)
df <-
  data.frame(
    event_1 = sample(c(FALSE, TRUE), 30, replace = TRUE, prob = c(0.85, 0.15)),
    event_2 = sample(c(FALSE, TRUE), 30, replace = TRUE, prob = c(0.9, 0.1)),
    event_3 = sample(c(FALSE, TRUE), 30, replace = TRUE, prob = c(0.75, 0.25)),
    group = sample(c("A", "B"), 30, replace = TRUE, prob = c(0.5, 0.5))
  ) |>
  rowwise() |>
  mutate(Total = sum(event_1, event_2, event_3))
tbl_summary(
  df,
  by = group,
  type = Total ~ "continuous", # force Total to continuous so "{sum}" applies
  statistic =
    list(all_categorical() ~ "{n}",
         all_continuous() ~ "{sum}")
) |>
  as_kable() # convert to kable to display on stack overflow
Characteristic   A, N = 16   B, N = 14
event_1          4           4
event_2          1           2
event_3          7           6
Total            12          12
Created on 2023-01-12 with reprex v2.0.2

Thank you so much (great package, gtsummary). That works! I had some trouble summing over factors. If the variables are factors, the code
mutate(Total = sum(event_1 == "Yes", event_2 == "Yes", event_3 == "Yes"))
does it.
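For anyone else who lands here with factor columns, a minimal self-contained sketch of that variant (assuming the same column names as above; each == "Yes" comparison yields TRUE/FALSE, which sum() counts as 1/0):

library(gtsummary)
library(tidyverse)

df_factor <- data.frame(
  event_1 = factor(sample(c("No", "Yes"), 30, replace = TRUE, prob = c(0.85, 0.15))),
  event_2 = factor(sample(c("No", "Yes"), 30, replace = TRUE, prob = c(0.9, 0.1))),
  event_3 = factor(sample(c("No", "Yes"), 30, replace = TRUE, prob = c(0.75, 0.25))),
  group   = sample(c("A", "B"), 30, replace = TRUE)
) |>
  rowwise() |>
  # count how many of the three events are "Yes" in each row
  mutate(Total = sum(event_1 == "Yes", event_2 == "Yes", event_3 == "Yes")) |>
  ungroup()

tbl_summary(
  df_factor,
  by = group,
  type = Total ~ "continuous",
  statistic = list(all_categorical() ~ "{n}",
                   all_continuous() ~ "{sum}")
)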

Related

Polars Dataframe: Apply MinMaxScaler to a column with condition

I am trying to perform the following operation in Polars.
Values in column B below 80 should be scaled between 1 and 4, while values of 80 or above should be set to 5.
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

df_pandas = pd.DataFrame(
    {
        "A": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        "B": [50, 300, 80, 12, 105, 78, 66, 42, 61.5, 35],
    }
)
test_scaler = MinMaxScaler(feature_range=(1, 4))
df_pandas.loc[df_pandas['B'] < 80, 'Test'] = test_scaler.fit_transform(
    df_pandas.loc[df_pandas['B'] < 80, "B"].values.reshape(-1, 1)
)
df_pandas = df_pandas.fillna(5)
This is what I did with Polars:
import numpy as np
import polars as pl

df = pl.from_pandas(df_pandas)  # polars version of the frame above

# dt is a dictionary
dt = df.filter(
    pl.col('B') < 80
).to_dict(as_series=False)
below_80 = list(dt.keys())
dt_scale = list(
    test_scaler.fit_transform(
        np.array(dt['B']).reshape(-1, 1)
    ).reshape(-1)  # reshape back to one dimension
)
# reassign to dictionary dt
dt['B'] = dt_scale
dt_scale_df = pl.DataFrame(dt)
dt_scale_df

dummy = df.join(
    dt_scale_df, how="left", on="A"
).fill_null(5)
dummy = dummy.rename({"B_right": "Test"})
Result:
 A     B        Test
 1     50.0     2.727273
 2     300.0    5.000000
 3     80.0     5.000000
 4     12.0     1.000000
 5     105.0    5.000000
 6     78.0     4.000000
 7     66.0     3.454545
 8     42.0     2.363636
 9     61.5     3.250000
10     35.0     2.045455
Is there a better approach for this?
Alright, I have got three examples for you that should help, of which the last is to be preferred.
Because you only want to apply your scaler to part of a column, we should ensure we only send that part of the data to the scaler. This can be done by:
window function over a partition
partition_by
when -> then -> otherwise + min_max expression
Window function over partition
This requires a python function that will be applied over the partitions. In the function itself we then have to check which partition we are in and deal with it accordingly.
df = pl.from_pandas(df_pandas)
min_max_sc = MinMaxScaler((1, 4))

def my_scaler(s: pl.Series) -> pl.Series:
    if s.len() > 0 and s[0] >= 80:
        # this is the B >= 80 partition: everything becomes 5
        out = (s * 0 + 5)
    else:
        out = pl.Series(min_max_sc.fit_transform(s.to_numpy().reshape(-1, 1)).flatten())
    # ensure all types are the same
    return out.cast(pl.Float64)

df.with_column(
    pl.col("B").apply(my_scaler).over(pl.col("B") < 80).alias("Test")
)
partition_by
This partitions the original dataframe into a dictionary holding the different partitions. We then only modify the partitions as needed.
parts = (df
         .with_column((pl.col("B") < 80).alias("part"))
         .partition_by("part", as_dict=True)
         )
parts[True] = parts[True].with_column(
    pl.col("B").map(
        lambda s: pl.Series(min_max_sc.fit_transform(s.to_numpy().reshape(-1, 1)).flatten())
    ).alias("Test")
)
parts[False] = parts[False].with_column(
    pl.lit(5.0).alias("Test")
)
pl.concat([df for df in parts.values()]).select(pl.all().exclude("part"))
when -> then -> otherwise + min_max expression
This one I like best. We can make a function that creates a polars expression that is the min_max scaling you need. This will have the best performance.
def min_max_scaler(col: str, predicate: pl.Expr):
    x = pl.col(col)
    x_min = x.filter(predicate).min()
    x_max = x.filter(predicate).max()
    # * 3 + 1 to set scale between 1 - 4
    return (x - x_min) / (x_max - x_min) * 3 + 1

predicate = pl.col("B") < 80
df.with_column(
    pl.when(predicate)
    .then(min_max_scaler("B", predicate))
    .otherwise(5).alias("Test")
)
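For the record, the * 3 + 1 is just the general min-max formula scaled = (x - x_min) / (x_max - x_min) * (high - low) + low with low = 1 and high = 4, which is exactly what MinMaxScaler(feature_range=(1, 4)) computes, so this expression reproduces sklearn's result without leaving Polars.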

Multilevel data simulation using simglm: how do I simulate random effects at level 1

I want to add random effects at level 1. Below is the working code with level 2 simulated.
I have two questions I hope folks can help with:
How do I get a reasonable estimate of level 2 variance (assuming I have sample data)? Can I just square the between-person SD on the DV?
How do I simulate level 1 variance, and how do I determine a reasonable value at level 1?
I've tried:
randomeffect = list(int_neighborhood = list(variance = 8, var_level = 2),
                    weight = list(variance = 8, var_level = 1))
but that kicks an error.
This code works without level-1 random effects:
library(simglm)
library(nlme)
library(dplyr) # for %>%

ctrl <- lmeControl(opt = 'optim')
sim_arguments <- list(
  formula = y ~ 1 + weight + age + sex + (1 | neighborhood),
  reg_weights = c(4, -0.03, 0.2, 0.33),
  fixed = list(weight = list(var_type = 'continuous', mean = 180, sd = 30),
               age = list(var_type = 'ordinal', levels = 30:60),
               sex = list(var_type = 'factor', levels = c('male', 'female'))),
  randomeffect = list(int_neighborhood = list(variance = 8, var_level = 2)),
  sample_size = list(level1 = 62, level2 = 60)
)
nested_data <- sim_arguments %>%
  simulate_fixed(data = NULL, .) %>%
  simulate_randomeffect(sim_arguments) %>%
  simulate_error(sim_arguments) %>%
  generate_response(sim_arguments)

RandomIntercept <- lme(fixed = y ~ 1 + weight + age + sex,
                       random = ~ 1 | neighborhood,
                       correlation = corAR1(),
                       data = nested_data,
                       control = ctrl,
                       na.action = na.exclude)
summary(RandomIntercept)

RandomSlope <- lme(fixed = y ~ 1 + weight + age + sex,
                   random = ~ 1 + weight | neighborhood,
                   correlation = corAR1(),
                   data = nested_data,
                   control = ctrl,
                   na.action = na.exclude)
summary(RandomSlope)
anova(RandomIntercept, RandomSlope)
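One avenue worth trying, offered as a hedged sketch rather than a confirmed fix: in simglm the level-1 residual variance is typically specified through a separate error argument (consumed by the simulate_error() step above), not as a var_level = 1 entry inside randomeffect, which would explain the error. Assuming that argument, the specification would look like this:

library(simglm)

sim_arguments <- list(
  formula = y ~ 1 + weight + age + sex + (1 | neighborhood),
  reg_weights = c(4, -0.03, 0.2, 0.33),
  fixed = list(weight = list(var_type = 'continuous', mean = 180, sd = 30),
               age = list(var_type = 'ordinal', levels = 30:60),
               sex = list(var_type = 'factor', levels = c('male', 'female'))),
  randomeffect = list(int_neighborhood = list(variance = 8, var_level = 2)),
  error = list(variance = 4), # assumed argument: level-1 (residual) variance
  sample_size = list(level1 = 62, level2 = 60)
)

As for reasonable values: fitting lme()/lmer() to pilot data and reading VarCorr() gives the intercept (level-2) and residual (level-1) variances directly; the variances are just the squares of the SDs reported there.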

Correcting satellite image overlays for Rayshader

I'm trying to improve the look of Rayshader by overlaying more recent (higher detail) satellite imagery (that I'm getting via the {leaflet} package), but the overlay doesn't match the 3D rendering.
Ideally I'm looking for an open-source solution that can get global satellite imagery. Bonus points if you find finer detail data for my area of interest - Hawaii.
One method using {geoviz} and {rayshader} uses the slippy_overlay() function to create a number of overlay images from either Mapbox (satellite, mapbox-streets-v8, mapbox-terrain-v2, mapbox-traffic-v1, terrain-rgb, mapbox-incidents-v1) or Stamen. Although I found mapbox-terrain-v2 the best, it still lacks the detail I would like. Since it requires setting up a Mapbox API key, I just use stamen/watercolor below:
library(geoviz)
library(rayshader)

### Maui
lat = 20.785700
lon = -156.259204
square_km = 22
max_tiles = 10

dem <- mapzen_dem(lat, lon, square_km, max_tiles)
elev_matrix = matrix(
  raster::extract(dem, raster::extent(dem), buffer = 1000),
  nrow = ncol(dem),
  ncol = nrow(dem)
)
ambmat <- ambient_shade(elev_matrix, zscale = 30)
raymat <- ray_shade(elev_matrix, zscale = 30, lambert = TRUE)
watermap <- detect_water(elev_matrix)

overlay_img <-
  slippy_overlay(dem,
                 image_source = "stamen",
                 image_type = "watercolor",
                 png_opacity = 0.3,
                 max_tiles = max_tiles)
elev_matrix %>%
  sphere_shade(sunangle = 270, texture = "imhof4") %>%
  add_water(detect_water(elev_matrix), color = "imhof4") %>%
  add_shadow(ray_shade(elev_matrix, zscale = 3, maxsearch = 300), 0.5) %>%
  add_shadow(ambmat, 0.5) %>%
  add_overlay(overlay_img) %>%
  plot_3d(elev_matrix,
          solid = TRUE,
          water = TRUE,
          waterdepth = 0,
          wateralpha = 0.5,
          watercolor = "lightblue",
          waterlinecolor = "white",
          waterlinealpha = 0.5,
          zscale = raster_zscale(dem) / 3,
          fov = 0, theta = 135, zoom = 0.75, phi = 45, windowsize = c(1000, 800))
I'm trying to adapt Will Bishop's workflow for getting overlays with the leaflet package, but the result is very odd. Will's approach is a bit different, as it fetches elevation data from USGS, which doesn't have bathymetric elevation (a must for me), so I used geoviz.
library(leaflet)

# define bounding box with longitude/latitude coordinates
bbox <- list(
  p1 = list(long = -156.8037, lat = 20.29737),
  p2 = list(long = -155.7351, lat = 21.29577)
)

leaflet() %>%
  addTiles() %>%
  addRectangles(
    lng1 = bbox$p1$long, lat1 = bbox$p1$lat,
    lng2 = bbox$p2$long, lat2 = bbox$p2$lat,
    fillColor = "transparent"
  ) %>%
  fitBounds(
    lng1 = bbox$p1$long, lat1 = bbox$p1$lat,
    lng2 = bbox$p2$long, lat2 = bbox$p2$lat
  )
What's the area of my hillshade from geoviz?
dim(dem)
#> [1] 780 780   1
Okay, so the overlay image needs to be 780 x 780, so I modify the helper functions to download the overlay with the World_Imagery basemap:
define_image_size <- function(bbox, major_dim = 780) {
  # calculate aspect ratio (width/height) from lat/long bounding box
  aspect_ratio <- abs((bbox$p1$long - bbox$p2$long) / (bbox$p1$lat - bbox$p2$lat))
  # define dimensions
  img_width <- ifelse(aspect_ratio > 1, major_dim, major_dim * aspect_ratio) %>% round()
  img_height <- ifelse(aspect_ratio < 1, major_dim, major_dim / aspect_ratio) %>% round()
  size_str <- paste(img_width, img_height, sep = ",")
  list(height = img_height, width = img_width, size = size_str)
}
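Plugging in the bbox above makes the helper concrete: aspect_ratio = abs((-156.8037 + 155.7351) / (20.29737 - 21.29577)) = 1.0686 / 0.9984 ≈ 1.07, so with major_dim = 780 it returns width = 780 and height = round(780 / 1.07) = 729. Note that this is not square, while the elevation matrix is 780 x 780.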
get_arcgis_map_image <- function(bbox, map_type = "World_Imagery", file = NULL,
                                 width = 780, height = 780, sr_bbox = 4326) {
  require(httr)
  require(glue)
  require(jsonlite)

  url <- parse_url("https://utility.arcgisonline.com/arcgis/rest/services/Utilities/PrintingTools/GPServer/Export%20Web%20Map%20Task/execute")

  # define JSON query parameter
  web_map_param <- list(
    baseMap = list(
      baseMapLayers = list(
        list(url = jsonlite::unbox(glue("https://services.arcgisonline.com/ArcGIS/rest/services/{map_type}/MapServer",
                                        map_type = map_type)))
      )
    ),
    exportOptions = list(
      outputSize = c(width, height)
    ),
    mapOptions = list(
      extent = list(
        spatialReference = list(wkid = jsonlite::unbox(sr_bbox)),
        xmax = jsonlite::unbox(max(bbox$p1$long, bbox$p2$long)),
        xmin = jsonlite::unbox(min(bbox$p1$long, bbox$p2$long)),
        ymax = jsonlite::unbox(max(bbox$p1$lat, bbox$p2$lat)),
        ymin = jsonlite::unbox(min(bbox$p1$lat, bbox$p2$lat))
      )
    )
  )

  res <- GET(
    url,
    query = list(
      f = "json",
      Format = "PNG32",
      Layout_Template = "MAP_ONLY",
      Web_Map_as_JSON = jsonlite::toJSON(web_map_param))
  )

  if (status_code(res) == 200) {
    body <- content(res, type = "application/json")
    message(jsonlite::toJSON(body, auto_unbox = TRUE, pretty = TRUE))
    if (is.null(file))
      file <- tempfile("overlay_img", fileext = ".png")
    img_res <- GET(body$results[[1]]$value$url)
    img_bin <- content(img_res, "raw")
    writeBin(img_bin, file)
    message(paste("image saved to file:", file))
  } else {
    message(res)
  }
  invisible(file)
}
Now download the file, then load it:
image_size <- define_image_size(bbox, major_dim = 780)

# fetch overlay image
overlay_file <- "maui_overlay.png"
get_arcgis_map_image(bbox, map_type = "World_Imagery", file = overlay_file,
                     # width = image_size$width, height = image_size$height,
                     sr_bbox = 4326)
overlay_img <- png::readPNG("maui_overlay.png")
Okay, let's make the plot:
elev_matrix %>%
  sphere_shade(sunangle = 270, texture = "imhof4") %>%
  add_water(detect_water(elev_matrix), color = "imhof4") %>%
  add_shadow(ray_shade(elev_matrix, zscale = 3, maxsearch = 300), 0.5) %>%
  add_shadow(ambmat, 0.5) %>%
  add_overlay(overlay_img, alphacolor = 1) %>%
  plot_3d(elev_matrix,
          solid = TRUE,
          water = TRUE,
          waterdepth = 0,
          wateralpha = 0.5,
          watercolor = "lightblue",
          waterlinecolor = "white",
          waterlinealpha = 0.5,
          zscale = raster_zscale(dem) / 3,
          fov = 0, theta = 135, zoom = 0.75, phi = 45, windowsize = c(1000, 800))
As you can see, the overlay image is rotated relative to the hillshade.
Now I'm also realizing that fetching satellite imagery with a bounding-box method isn't ideal when you're trying to show bathymetric data. It would be ideal to subset this overlay somehow programmatically, but I'll probably just end up using Inkscape once I've figured out how to rotate the overlay.
I tried to use {magick}'s image_rotate() function to no avail:
library(magick)
maui <- magick::image_read("maui_overlay.png")
image_rotate(maui, 30) # -> maui_30
# image_write(maui_30, path = "maui_overlay_30.png", format = "png")
But magick has changed the dimensions:
# A tibble: 1 x 7
  format width height colorspace matte filesize density
  <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>
1 PNG     1068   1068 sRGB       TRUE         0 38x38
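That size change is expected: rotating a w x h image enlarges the canvas to the rotated bounding box, roughly w*cos(30°) + h*sin(30°) = 780 * 0.866 + 780 * 0.5 ≈ 1066 pixels on each side (plus a little padding), hence 1068 x 1068.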
And it will give an error with rayshader:
overlay_img <- png::readPNG("maui_overlay_30.png")
elev_matrix %>%
  sphere_shade(sunangle = 270, texture = "imhof4") %>%
  add_water(detect_water(elev_matrix), color = "imhof4") %>%
  add_shadow(ray_shade(elev_matrix, zscale = 3, maxsearch = 300), 0.5) %>%
  add_shadow(ambmat, 0.5) %>%
  add_overlay(overlay_img, alphacolor = 1) %>%
  plot_3d(elev_matrix,
          solid = TRUE,
          water = TRUE,
          waterdepth = 0,
          wateralpha = 0.5,
          watercolor = "lightblue",
          waterlinecolor = "white",
          waterlinealpha = 0.5,
          zscale = raster_zscale(dem) / 3,
          fov = 0, theta = 135, zoom = 0.75, phi = 45, windowsize = c(1000, 800))
Error in add_overlay(., overlay_img, alpha = 0.8) : argument 3 matches multiple formal arguments
The answer couldn't have been simpler... the image array needed to be transposed: overlay_img = aperm(overlay_img, c(2, 1, 3)).
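A minimal sketch of that fix in context (same objects as above; aperm() swaps the first two dimensions of the image array so its rows line up with the rows of the elevation matrix):

overlay_img <- png::readPNG("maui_overlay.png")
overlay_img <- aperm(overlay_img, c(2, 1, 3)) # transpose width/height, keep the colour channels

elev_matrix %>%
  sphere_shade(sunangle = 270, texture = "imhof4") %>%
  add_overlay(overlay_img, alphacolor = 1) %>%
  plot_3d(elev_matrix,
          zscale = raster_zscale(dem) / 3,
          fov = 0, theta = 135, zoom = 0.75, phi = 45)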

Group_by returns just one row while aggregate returns the expected outcome

I am currently stuck at the post-processing of some EddyData. Following an example (https://github.com/bgctw/REddyProc/blob/master/vignettes/aggUncertainty.md) I ran into an unexpected outcome of group_by which is reproducible, but I don't understand why.
group_by returns just one row, while aggregate gives the expected outcome.
Here is a minimal example:
library(tidyverse)

# create example data frame
date.time <- seq(from = as.POSIXct("2015-01-01 00:30:00"),
                 to = as.POSIXct("2015-01-03 00:30:00"), by = "30 mins")
nee <- runif(length(date.time), -200, 200)
df <- data.frame(date.time, nee)

# calculate day of the year
df <- df %>% mutate(
  date.time = df$date.time
  , DoY = as.POSIXlt(date.time - 15*60)$yday # midnight belongs to the previous day
)
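(To see the "midnight belongs to the previous day" trick: as.POSIXlt(as.POSIXct("2015-01-02 00:00:00") - 15*60)$yday is 0, i.e. January 1, whereas without the 15-minute shift it would be 1.)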
# trying to summarise nee for each day
aggDay <- df %>% group_by(DoY) %>% summarise(nee = sum(nee))
aggDay
       nee
1 322.1195
aggDay just returns one row, while aggregate works in this case:
aggregate(df$nee, by = list(df$DoY), sum)
  Group.1          x
1       0  -25.15698
2       1  448.13960
3       2 -100.86310
Unfortunately, the original code involves some further calculations which is the reason why I'd like to stay with group_by.
# original code, not reproducible here
aggDay <- df %>% group_by(DoY) %>%
  summarise(
    DateTime = first(DateTime)
    , nRec = sum(NEE_uStar_fqc == 0, na.rm = TRUE)
    , nEff = computeEffectiveNumObs(
        resid, effAcf = !!autoCorr, na.rm = TRUE)
    , NEE = mean(NEE_uStar_f, na.rm = TRUE)
    , sdNEE = if (nEff <= 1) NA_real_ else sqrt(
        mean(NEE_uStar_fsd^2, na.rm = TRUE) / (nEff - 1))
    , sdNEEuncorr = if (nRec == 0) NA_real_ else sqrt(
        mean(NEE_uStar_fsd^2, na.rm = TRUE) / (nRec - 1))
  )
I restarted RStudio and now it works. Don't ask me. There must have been a problem with another loaded package.
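A hedged guess at the culprit for anyone who hits the same thing: if plyr (or another package that exports its own summarise()) is attached after dplyr, plyr::summarise() masks dplyr's version and ignores the grouping, collapsing everything to one row. Calling the verbs with an explicit namespace guards against that:

library(dplyr)
aggDay <- df %>%
  dplyr::group_by(DoY) %>%
  dplyr::summarise(nee = sum(nee)) # explicit dplyr:: sidesteps any summarise() masking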

Different clusters with the same method

I am stuck on a problem with hierarchical clustering. I want to make a dendrogram and a heatmap, with a correlation distance (d_mydata = dist(1 - cor(t(mydata)))) and ward.D2 as the clustering method.
As a gadget, the pheatmap package can plot the dendrogram on the left side to visualize the clusters.
The pipeline of my analysis would be this:
create the dendrogram
test how many cluster would be the optimal (k)
extract the subjects in each cluster
create a heatmap
My surprise comes when the dendrogram plotted in the heatmap is not the same as the one plotted before, even though the methods are the same.
So I decided to create a pheatmap coloured by the clusters classified beforehand by cutree, and to test whether the colours correspond to the clusters in the dendrogram.
This is my code:
# Create test matrix
test = matrix(rnorm(200), 20, 10)
test[1:10, seq(1, 10, 2)] = test[1:10, seq(1, 10, 2)] + 3
test[11:20, seq(2, 10, 2)] = test[11:20, seq(2, 10, 2)] + 2
test[15:20, seq(2, 10, 2)] = test[15:20, seq(2, 10, 2)] + 4
colnames(test) = paste("Test", 1:10, sep = "")
rownames(test) = paste("Gene", 1:20, sep = "")
test <- as.data.frame(test)

# Create a dendrogram with this test matrix
dist_test <- dist(test)
hc = hclust(dist_test, method = "ward.D2")
plot(hc)
dend <- as.dendrogram(hc, check = FALSE, nodePar = list(cex = .000007),
                      leaflab = "none", cex.main = 3, axes = FALSE, adjust = FALSE)
clus2 <- as.factor(cutree(hc, k = 2)) # cut tree into 2 clusters
groups <- data.frame(clus2)
groups$id <- rownames(groups)
# ---------- data frame with mydata and the cluster classification as factors ----------
test$id <- rownames(test)
clusters <- merge(groups, test, by.x = "id")
rownames(clusters) <- clusters$id
clusters$clus2 <- as.character(clusters$clus2)
clusters$clus2[clusters$clus2 == "1"] <- "cluster1"
clusters$clus2[clusters$clus2 == "2"] <- "cluster2"

plot(dend,
     main = "test",
     horiz = TRUE, leaflab = "none")
d_clusters <- dist(1 - cor(t(clusters[, 7:10])))
hc_cl = hclust(d_clusters, method = "ward.D2")

annotation_col = data.frame(
  Path = factor(colnames(clusters[3:12]))
)
rownames(annotation_col) = colnames(clusters[3:12])

annotation_row = data.frame(
  Group = factor(clusters$clus2)
)
rownames(annotation_row) = rownames(clusters)

# Specify colors
ann_colors = list(
  Path = c(Test1 = "darkseagreen", Test2 = "lavenderblush2", Test3 = "lightcyan3",
           Test4 = "mediumpurple", Test5 = "red", Test6 = "blue", Test7 = "brown",
           Test8 = "pink", Test9 = "black", Test10 = "grey"),
  Group = c(cluster1 = "yellow", cluster2 = "blue")
)

library(RColorBrewer)
cols <- colorRampPalette(brewer.pal(10, "RdYlBu"))(20)

library(pheatmap)
pheatmap(clusters[, 3:12], color = rev(cols),
         scale = "column",
         kmeans_k = NA,
         show_rownames = FALSE, show_colnames = TRUE,
         main = "Heatmap CK14, CK5/6, GATA3 and FOXA1 n=492 SCALE",
         clustering_method = "ward.D2",
         cluster_rows = TRUE, cluster_cols = TRUE,
         clustering_distance_rows = "correlation",
         clustering_distance_cols = "correlation",
         annotation_row = annotation_row,
         annotation_col = annotation_col,
         annotation_colors = ann_colors
)
Anyone with the same issue? Am I making a stupid mistake?
Thank you in advance.
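One observation, offered as a hedged guess: the standalone tree is built with hclust(dist(test), method = "ward.D2"), i.e. Euclidean distance on the raw values, while pheatmap is asked to cluster with clustering_distance_rows = "correlation" (and scale = "column" may further change what is clustered), so the two dendrograms are not computed from the same distances and need not agree. A minimal sketch of one way to force agreement, relying on pheatmap's documented option of passing an hclust object to cluster_rows:

library(pheatmap)

mat <- clusters[, 3:12]
# cluster once, with exactly the distance and linkage you want...
hc_rows <- hclust(dist(1 - cor(t(mat))), method = "ward.D2")
plot(as.dendrogram(hc_rows), horiz = TRUE, leaflab = "none")
# ...then hand that same tree to pheatmap instead of letting it recluster
pheatmap(mat,
         cluster_rows = hc_rows,
         clustering_method = "ward.D2")

With a single precomputed tree, the dendrogram beside the heatmap is by construction the one plotted above, and cutree(hc_rows, k = 2) labels line up with the heatmap rows.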