############################################### ###### Cleaning and filtering valid data ###### ############################################### ## Run the following in the database (order matters) # 0. Compute IKI for each record in KEYSTROKES (~ 16 mins) ALTER TABLE KEYSTROKES ADD COLUMN IKI DOUBLE; select @prev:=PRESS_TIME from KEYSTROKES ORDER BY PRESS_TIME ASC, TEST_SECTION_ID ASC LIMIT 1,1; update KEYSTROKES k set k.IKI = ( select pk.IKI from ( select ks.KEYSTROKE_ID, ks.PRESS_TIME, @prev, ks.PRESS_TIME - @prev as IKI, @prev:=ks.PRESS_TIME from KEYSTROKES ks ORDER BY ks.PRESS_TIME ASC, ks.TEST_SECTION_ID ASC ) as pk where k.KEYSTROKE_ID = pk.KEYSTROKE_ID ) # 1.a. Define and select valid participants CREATE TABLE tmp LIKE PARTICIPANTS; INSERT INTO tmp SELECT * FROM PARTICIPANTS WHERE AGE IS NOT NULL AND WPM > 0 AND WPM < 200 AND ERROR_RATE < 25 AND DEVICE ='mobile' AND PARTICIPANT_ID > 607 # 1.b. Calculate 2SD from mean age. SELECT @Std := 2*STD(age), @Avg := AVG(age) FROM tmp # 1.c. Use only data with age within 2SD from the mean. CREATE TABLE VALID_PARTICIPANTS LIKE PARTICIPANTS; INSERT INTO VALID_PARTICIPANTS SELECT * FROM tmp WHERE AGE > 5 AND AGE < @Avg + @Std # 2. Select test sections from valid participants EXCLUDING test sections with IKI > 5000 # Select only data with non-negative WPM and not-null USER_INPUT CREATE TABLE VALID_TEST_SECTIONS LIKE TEST_SECTIONS; INSERT INTO VALID_TEST_SECTIONS SELECT * FROM TEST_SECTIONS WHERE PARTICIPANT_ID IN ( SELECT PARTICIPANT_ID FROM VALID_PARTICIPANTS ) AND WPM > 0 AND WPM < 200 AND ERROR_RATE < 25 AND USER_INPUT IS NOT NULL AND TEST_SECTION_ID NOT IN ( SELECT DISTINCT TEST_SECTION_ID FROM KEYSTROKES WHERE IKI > 5000 ) # 3. Make sure there are at least 15 test sections for each valid participant # If this query returns non-zero number of rows, perform steps 4 and 5. SELECT * FROM ( SELECT PARTICIPANT_ID, COUNT(TEST_SECTION_ID) AS TS_COUNT FROM VALID_TEST_SECTIONS GROUP BY PARTICIPANT_ID ORDER BY TS_COUNT ASC) AS tmp WHERE tmp.TS_COUNT < 15 # 4. Update VALID_PARTICIPANTS to discard those with less than 15 valid test sections DROP TABLE VALID_PARTICIPANTS; CREATE TABLE VALID_PARTICIPANTS LIKE PARTICIPANTS; INSERT INTO VALID_PARTICIPANTS SELECT * FROM PARTICIPANTS WHERE PARTICIPANT_ID IN ( SELECT tmp.PARTICIPANT_ID FROM ( SELECT PARTICIPANT_ID, COUNT(TEST_SECTION_ID) AS TS_COUNT FROM VALID_TEST_SECTIONS GROUP BY PARTICIPANT_ID ORDER BY TS_COUNT ASC) AS tmp WHERE tmp.TS_COUNT >= 15 ); # 5. Update VALID_TEST_SECTIONS to match VALID_PARTICIPANTS DROP TABLE VALID_TEST_SECTIONS; CREATE TABLE VALID_TEST_SECTIONS LIKE TEST_SECTIONS; INSERT INTO VALID_TEST_SECTIONS SELECT * FROM TEST_SECTIONS WHERE PARTICIPANT_ID IN ( SELECT PARTICIPANT_ID FROM VALID_PARTICIPANTS ) AND WPM > 0 AND WPM < 200 AND ERROR_RATE < 25 AND USER_INPUT IS NOT NULL AND TEST_SECTION_ID NOT IN ( SELECT DISTINCT TEST_SECTION_ID FROM KEYSTROKES WHERE IKI > 5000 ) # 6. Double-check step 3. Should return zero rows now. Remove tmp. ############################################ ###### Computing perfornamce measures ###### ############################################ ALTER TABLE VALID_TEST_SECTIONS ADD COLUMN TS_KPD DOUBLE; UPDATE VALID_TEST_SECTIONS ts set TS_KPD=( SELECT AVG(k.RELEASE_TIME - k.PRESS_TIME) from KEYSTROKES k where k.TEST_SECTION_ID = ts.TEST_SECTION_ID ) ALTER TABLE VALID_TEST_SECTIONS ADD COLUMN TS_IKI DOUBLE; UPDATE VALID_TEST_SECTIONS ts set TS_IKI=( SELECT AVG(k.IKI) from KEYSTROKES k where k.TEST_SECTION_ID = ts.TEST_SECTION_ID ) ALTER TABLE VALID_TEST_SECTIONS ADD COLUMN TS_BSP DOUBLE; UPDATE VALID_TEST_SECTIONS ts set TS_BSP =( SELECT COUNT(k.KEYSTROKE_ID) from KEYSTROKES k where k.TEST_SECTION_ID = ts.TEST_SECTION_ID AND k.LETTER = 'BKSP' ) ALTER TABLE VALID_TEST_SECTIONS ADD COLUMN TS_ECPC DOUBLE; UPDATE VALID_TEST_SECTIONS ts set TS_ECPC =( ts.TS_BSP / ts.INPUT_LENGTH ) ALTER TABLE VALID_TEST_SECTIONS ADD COLUMN TS_KS INT; UPDATE VALID_TEST_SECTIONS ts set TS_KS =( SELECT COUNT(k.KEYSTROKE_ID) from KEYSTROKES k where k.TEST_SECTION_ID = ts.TEST_SECTION_ID ) ALTER TABLE VALID_TEST_SECTIONS ADD COLUMN TS_KSPC DOUBLE; UPDATE VALID_TEST_SECTIONS ts set TS_KSPC =( ts.TS_KS / ts.TS_UILEN ) # Include only first 15 sentences from valid participants. CREATE TABLE FIRST_15_TEST_SECTIONS LIKE VALID_TEST_SECTIONS; INSERT INTO FIRST_15_TEST_SECTIONS SELECT * FROM VALID_TEST_SECTIONS WHERE TEST_SECTION_ID IN ( SELECT TEST_SECTION_ID FROM ( SELECT @type_number := IF(@PARTICIPANT_ID = PARTICIPANT_ID, @type_number + 1, 1) AS type_number, @PARTICIPANT_ID := PARTICIPANT_ID as PARTICIPANT_ID, TEST_SECTION_ID FROM (SELECT @type_number := 1) x, (SELECT TEST_SECTION_ID, @PARTICIPANT_ID := PARTICIPANT_ID as PARTICIPANT_ID FROM VALID_TEST_SECTIONS WHERE TS_IKI < 5000 ORDER BY PARTICIPANT_ID) y ) z WHERE type_number <= 15 ); ALTER TABLE VALID_PARTICIPANTS ADD COLUMN P_KPD DOUBLE; ALTER TABLE VALID_PARTICIPANTS ADD COLUMN P_IKI DOUBLE; ALTER TABLE VALID_PARTICIPANTS ADD COLUMN P_BSP DOUBLE; ALTER TABLE VALID_PARTICIPANTS ADD COLUMN P_ECPC DOUBLE; ALTER TABLE VALID_PARTICIPANTS ADD COLUMN P_UILEN DOUBLE; ALTER TABLE VALID_PARTICIPANTS ADD COLUMN P_KSPC DOUBLE; UPDATE VALID_PARTICIPANTS p set P_KPD =( SELECT AVG(ts.TS_KPD) from FIRST_15_TEST_SECTIONS ts where ts.PARTICIPANT_ID = p.PARTICIPANT_ID ); UPDATE VALID_PARTICIPANTS p set P_IKI =( SELECT AVG(ts.TS_IKI) from FIRST_15_TEST_SECTIONS ts where ts.PARTICIPANT_ID = p.PARTICIPANT_ID ); UPDATE VALID_PARTICIPANTS p set P_BSP =( SELECT AVG(ts.TS_BSP) from FIRST_15_TEST_SECTIONS ts where ts.PARTICIPANT_ID = p.PARTICIPANT_ID ); UPDATE VALID_PARTICIPANTS p set P_ECPC =( SELECT AVG(ts.TS_ECPC) from FIRST_15_TEST_SECTIONS ts where ts.PARTICIPANT_ID = p.PARTICIPANT_ID ); UPDATE VALID_PARTICIPANTS p set P_UILEN =( SELECT AVG(ts.TS_UILEN) from FIRST_15_TEST_SECTIONS ts where ts.PARTICIPANT_ID = p.PARTICIPANT_ID ); UPDATE VALID_PARTICIPANTS p set P_KSPC =( SELECT AVG(ts.TS_KSPC) from FIRST_15_TEST_SECTIONS ts where ts.PARTICIPANT_ID = p.PARTICIPANT_ID ); ########################################### ###### Computing intelligent metrics ###### ########################################### CREATE TABLE VALID_INPUT_LOGDATA LIKE LOG_DATA; INSERT INTO VALID_INPUT_LOGDATA SELECT * FROM LOG_DATA WHERE TEST_SECTION_ID IN (SELECT TEST_SECTION_ID from FIRST_15_TEST_SECTIONS) and TYPE='input' # INPUT_LEN # update VALID_INPUT_LOGDATA set INPUT_LEN = LENGTH(INPUT) where TEST_SECTION_ID in (select TEST_SECTION_ID from FIRST_15_TEST_SECTIONS); # INPUT_LDIST # select @prev:=INPUT from VALID_INPUT_LOGDATA where TEST_SECTION_ID in (select TEST_SECTION_ID from FIRST_15_TEST_SECTIONS) ORDER BY TEST_SECTION_ID ASC, TIMESTAMP ASC LIMIT 1,1; update VALID_INPUT_LOGDATA ld set ld.INPUT_LDIST = ( select ldt.LDIST from ( select vld.LOG_DATA_ID, vld.TEST_SECTION_ID, @prev, vld.INPUT, levenshtein(vld.INPUT, @prev) as LDIST, @prev:=vld.INPUT from VALID_INPUT_LOGDATA vld where vld.TEST_SECTION_ID in (select TEST_SECTION_ID from FIRST_15_TEST_SECTIONS) ORDER BY vld.TEST_SECTION_ID ASC, vld.TIMESTAMP ASC ) as ldt where ld.LOG_DATA_ID = ldt.LOG_DATA_ID ) where TEST_SECTION_ID in (select TEST_SECTION_ID from FIRST_15_TEST_SECTIONS); # INPUT_PRLEN # select @prev:=INPUT_LEN from VALID_INPUT_LOGDATA where TEST_SECTION_ID in (select TEST_SECTION_ID from FIRST_15_TEST_SECTIONS) ORDER BY TEST_SECTION_ID ASC, TIMESTAMP ASC LIMIT 1,1; update VALID_INPUT_LOGDATA ld set ld.INPUT_PRLEN = ( select ldt.PREV from ( select vld.LOG_DATA_ID, vld.TEST_SECTION_ID, @prev as PREV, vld.INPUT_LEN, @prev:=vld.INPUT_LEN from VALID_INPUT_LOGDATA vld where vld.TEST_SECTION_ID in (select TEST_SECTION_ID from FIRST_15_TEST_SECTIONS) ORDER BY vld.TEST_SECTION_ID ASC, vld.TIMESTAMP ASC ) as ldt where ld.LOG_DATA_ID = ldt.LOG_DATA_ID ) where TEST_SECTION_ID in (select TEST_SECTION_ID from FIRST_15_TEST_SECTIONS); # INPUT_PRLDIST # select @prev:=INPUT_LDIST from VALID_INPUT_LOGDATA where TEST_SECTION_ID in (select TEST_SECTION_ID from FIRST_15_TEST_SECTIONS) ORDER BY TEST_SECTION_ID ASC, TIMESTAMP ASC LIMIT 1,1; update VALID_INPUT_LOGDATA ld set ld.INPUT_PRLDIST = ( select ldt.PREV from ( select vld.LOG_DATA_ID, vld.TEST_SECTION_ID, @prev as PREV, vld.INPUT_LDIST, @prev:=vld.INPUT_LDIST from VALID_INPUT_LOGDATA vld where vld.TEST_SECTION_ID in (select TEST_SECTION_ID from FIRST_15_TEST_SECTIONS) ORDER BY vld.TEST_SECTION_ID ASC, vld.TIMESTAMP ASC ) as ldt where ld.LOG_DATA_ID = ldt.LOG_DATA_ID ) where TEST_SECTION_ID in (select TEST_SECTION_ID from FIRST_15_TEST_SECTIONS); # INPUT_LC (last character) # ALTER TABLE VALID_INPUT_LOGDATA ADD COLUMN INPUT_LC CHAR; update VALID_INPUT_LOGDATA set INPUT_LC = case when RIGHT(INPUT, 1) = '"' then REPLACE(RIGHT(INPUT, 1), '"', '_') when RIGHT(INPUT, 1) = "'" then REPLACE(RIGHT(INPUT, 1), "'", '_') when RIGHT(INPUT, 1) = ' ' then REPLACE(RIGHT(INPUT, 1), ' ', '_') else 0 end where TEST_SECTION_ID in (select TEST_SECTION_ID from FIRST_15_TEST_SECTIONS); # INPUT_PRLC (last character of previous input) # ALTER TABLE VALID_INPUT_LOGDATA ADD COLUMN INPUT_PRLC CHAR; select @prev:=INPUT_PRLC from VALID_INPUT_LOGDATA where TEST_SECTION_ID in (select TEST_SECTION_ID from FIRST_15_TEST_SECTIONS) ORDER BY TEST_SECTION_ID ASC, TIMESTAMP ASC LIMIT 1,1; update VALID_INPUT_LOGDATA ld set ld.INPUT_PRLC = ( select ldt.PREV from ( select vld.LOG_DATA_ID, vld.TEST_SECTION_ID, @prev as PREV, vld.INPUT_LC, @prev:=vld.INPUT_LC from VALID_INPUT_LOGDATA vld where vld.TEST_SECTION_ID in (select TEST_SECTION_ID from FIRST_15_TEST_SECTIONS) ORDER BY vld.TEST_SECTION_ID ASC, vld.TIMESTAMP ASC ) as ldt where ld.LOG_DATA_ID = ldt.LOG_DATA_ID ) where TEST_SECTION_ID in (select TEST_SECTION_ID from FIRST_15_TEST_SECTIONS); # Classify Intelligent Text entry # If more than one characher changed, more than one character inputted, and last character is a whitespace, or the last word was also inputted as a whole, or this is the first word of the sentence -> gesture update VALID_INPUT_LOGDATA set ITE_SWYP = case when INPUT_LDIST > 1 and INPUT_LEN >= INPUT_PRLEN + 1 and ((INPUT_PRLDIST = 1 and INPUT_PRLC = '_') or (INPUT_PRLDIST > 1 and INPUT_PRLC != '_')) or (INPUT_LEN > 1 and INPUT_PRLEN > INPUT_LEN+15) then 1 else 0 end where TEST_SECTION_ID in (select TEST_SECTION_ID from FIRST_15_TEST_SECTIONS); # If more than one characher changed, more than one character inputted, and last character is not whitespace or dot -> prediction update VALID_INPUT_LOGDATA set ITE_PRED = case when INPUT_LDIST > 1 and INPUT_PRLDIST = 1 and INPUT_LEN > INPUT_PRLEN + 1 and INPUT_PRLC != '_' and INPUT_LC != '.' then 1 else 0 end where TEST_SECTION_ID in (select TEST_SECTION_ID from FIRST_15_TEST_SECTIONS); # If more than one characher changed, one or more character inputted, the lenght of the word is unchanged or changed a little, or one character changed, but the word length did not change -> autocorrection update VALID_INPUT_LOGDATA set ITE_AUTO = case when (INPUT_LDIST > 1 and INPUT_PRLDIST = 1 and INPUT_LEN >= INPUT_PRLEN and INPUT_LEN <= INPUT_PRLEN + 1) or (INPUT_LDIST = 1 and INPUT_LEN = INPUT_PRLEN and INPUT_LC != '.') then 1 else 0 end where TEST_SECTION_ID in (select TEST_SECTION_ID from FIRST_15_TEST_SECTIONS); # ITE, number of detected ITE per test section # UPDATE FIRST_15_TEST_SECTIONS ts set N_SWYP =( SELECT SUM(ld.ITE_SWYP) from VALID_INPUT_LOGDATA ld where ld.TEST_SECTION_ID = ts.TEST_SECTION_ID ) UPDATE FIRST_15_TEST_SECTIONS ts set N_PRED =( SELECT SUM(ld.ITE_PRED) from VALID_INPUT_LOGDATA ld where ld.TEST_SECTION_ID = ts.TEST_SECTION_ID ) UPDATE FIRST_15_TEST_SECTIONS ts set N_AUTO =( SELECT SUM(ld.ITE_AUTO) from VALID_INPUT_LOGDATA ld where ld.TEST_SECTION_ID = ts.TEST_SECTION_ID ) # ITE, proportion of detected ITE per test section (number of ITE divided by number of words) # UPDATE FIRST_15_TEST_SECTIONS ts set PR_SWYP =( N_SWYP / TS_NUM_WORDS ) UPDATE FIRST_15_TEST_SECTIONS ts set PR_PRED =( N_PRED / TS_NUM_WORDS ) UPDATE FIRST_15_TEST_SECTIONS ts set PR_AUTO =( N_AUTO / TS_NUM_WORDS ) # ITE, proportion of detected ITE per participant # UPDATE VALID_PARTICIPANTS p set ITE_SWYPE =( SELECT AVG(ts.PR_SWYP) from FIRST_15_TEST_SECTIONS ts where ts.PARTICIPANT_ID = p.PARTICIPANT_ID ); UPDATE VALID_PARTICIPANTS p set ITE_PREDICT =( SELECT AVG(ts.PR_PRED) from FIRST_15_TEST_SECTIONS ts where ts.PARTICIPANT_ID = p.PARTICIPANT_ID ); UPDATE VALID_PARTICIPANTS p set ITE_AUTOCORR =( SELECT AVG(ts.PR_AUTO) from FIRST_15_TEST_SECTIONS ts where ts.PARTICIPANT_ID = p.PARTICIPANT_ID ); ####################### ###### Functions ###### ####################### DELIMITER $$ CREATE FUNCTION levenshtein( s1 VARCHAR(255), s2 VARCHAR(255) ) RETURNS INT DETERMINISTIC BEGIN DECLARE s1_len, s2_len, i, j, c, c_temp, cost INT; DECLARE s1_char CHAR; -- max strlen=255 DECLARE cv0, cv1 VARBINARY(256); SET s1_len = CHAR_LENGTH(s1), s2_len = CHAR_LENGTH(s2), cv1 = 0x00, j = 1, i = 1, c = 0; IF s1 = s2 THEN RETURN 0; ELSEIF s1_len = 0 THEN RETURN s2_len; ELSEIF s2_len = 0 THEN RETURN s1_len; ELSE WHILE j <= s2_len DO SET cv1 = CONCAT(cv1, UNHEX(HEX(j))), j = j + 1; END WHILE; WHILE i <= s1_len DO SET s1_char = SUBSTRING(s1, i, 1), c = i, cv0 = UNHEX(HEX(i)), j = 1; WHILE j <= s2_len DO SET c = c + 1; IF s1_char = SUBSTRING(s2, j, 1) THEN SET cost = 0; ELSE SET cost = 1; END IF; SET c_temp = CONV(HEX(SUBSTRING(cv1, j, 1)), 16, 10) + cost; IF c > c_temp THEN SET c = c_temp; END IF; SET c_temp = CONV(HEX(SUBSTRING(cv1, j+1, 1)), 16, 10) + 1; IF c > c_temp THEN SET c = c_temp; END IF; SET cv0 = CONCAT(cv0, UNHEX(HEX(c))), j = j + 1; END WHILE; SET cv1 = cv0, i = i + 1; END WHILE; END IF; RETURN c; END$$ CREATE FUNCTION levenshtein_ratio( s1 VARCHAR(255), s2 VARCHAR(255) ) RETURNS INT DETERMINISTIC BEGIN DECLARE s1_len, s2_len, max_len INT; SET s1_len = LENGTH(s1), s2_len = LENGTH(s2); IF s1_len > s2_len THEN SET max_len = s1_len; ELSE SET max_len = s2_len; END IF; RETURN ROUND((1 - LEVENSHTEIN(s1, s2) / max_len) * 100); END$$ DELIMITER ; DELIMITER $$ CREATE FUNCTION wordcount(str TEXT) RETURNS INT DETERMINISTIC SQL SECURITY INVOKER NO SQL BEGIN DECLARE wordCnt, idx, maxIdx INT DEFAULT 0; DECLARE currChar, prevChar BOOL DEFAULT 0; SET maxIdx=char_length(str); WHILE idx < maxIdx DO SET currChar=SUBSTRING(str, idx, 1) RLIKE '[[:alnum:]]'; IF NOT prevChar AND currChar THEN SET wordCnt=wordCnt+1; END IF; SET prevChar=currChar; SET idx=idx+1; END WHILE; RETURN wordCnt; END $$ DELIMITER ;